<a href="https://colab.research.google.com/github/kavya6170/AI-Agent/blob/main/Project_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U \
langchain \
langchain-community \
langchain-core \
langchain-google-genai \
google-generativeai \
sentence-transformers \
faiss-cpu \
pypdf

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core
  Downloading langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-4.1.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-generativeai
  Downloading google_generativeai-0.8.6-py3-none-any.whl.metadata (3.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6

In [None]:
# Mount Google Drive: the PDFs, the vector store and the RF model used by
# this notebook all live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import getpass
import json
import os
import pickle
import re

import faiss
import joblib
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ======================================================
# GEMINI API KEY
# ======================================================
# The key must never be stored in the notebook. It is taken from the
# environment when already set; otherwise it is asked for interactively
# (getpass does not echo the input, so it does not end up in cell output).
# NOTE(review): the key that used to be hard-coded here was published with
# the notebook and should be treated as leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# ======================================================
# PATHS & STORAGE
# ======================================================
# Folder holding the source PDFs.
DATA_DIR = "/content/drive/MyDrive/Project"
# Folder where the embeddings and their bookkeeping files are persisted.
VECTOR_DIR = "/content/drive/MyDrive/vector_store"

# FAISS index holding the chunk vectors.
DOC_INDEX_PATH = os.path.join(VECTOR_DIR, "doc_index.faiss")
# Pickled list of chunk texts.
TEXTS_PATH = os.path.join(VECTOR_DIR, "texts.pkl")
# Pickled list of chunk metadata.
METADATA_PATH = os.path.join(VECTOR_DIR, "metadata.pkl")
# JSON list of the PDFs that have already been indexed.
INDEXED_FILES_PATH = os.path.join(VECTOR_DIR, "indexed_files.json")

# Trained Random Forest hallucination classifier.
RF_MODEL_PATH = "/content/drive/MyDrive/hallucination_RF_model.pkl"

# Create the vector-store folder on first use; a no-op when it already exists.
os.makedirs(VECTOR_DIR, exist_ok=True)

# ======================================================
# CONFIGURATION
# ======================================================
# Text splitting: chunk size and overlap between consecutive chunks.
CHUNK_SIZE = 600
CHUNK_OVERLAP = 120

# Number of most similar chunks fetched per query.
TOP_K = 3
# Upper bound on the context handed to the LLM, in characters.
MAX_CONTEXT_CHARS = 1600
# A retrieved chunk is kept only when its FAISS L2 distance is below this.
# NOTE(review): IndexFlatL2 reports squared L2 distances -- confirm the value
# was tuned on that scale.
SIMILARITY_THRESHOLD = 2.2
# A memory entry is reused only when its distance is below this.
MEMORY_SIMILARITY_THRESHOLD = 1.2
# Upper limit on the number of sub-queries produced by query decomposition.
MAX_SUBQUERIES = 6

# Topics the assistant refuses to answer.
DISALLOWED_KEYWORDS = [
    "rag",
    "llm",
    "ai",
    "machine learning",
    "deep learning",
    "transformer",
]

# Sentence-embedding model used for indexing, retrieval, memory and the RF
# features (MUST match the model used when the classifier was trained).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
dimension = embedder.get_sentence_embedding_dimension()

# Trained Random Forest hallucination classifier.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files you created yourself.
rf_model = joblib.load(RF_MODEL_PATH)
print("‚úÖ Random Forest hallucination model loaded")

# Gemini chat model used for sub-query generation and for answering.
llm = ChatGoogleGenerativeAI(model="gemini-flash-latest", temperature=0.1)

# ======================================================
# LOAD INDEXED FILES
# ======================================================
# Names of the PDFs that were already embedded in an earlier run; stays empty
# when no record has been saved yet.
indexed_files = set()
if os.path.exists(INDEXED_FILES_PATH):
    with open(INDEXED_FILES_PATH, "r") as f:
        indexed_files = set(json.load(f))

# ======================================================
# LOAD OR CREATE FAISS INDEX
# ======================================================
# Row i of the index, texts[i] and metadata[i] describe the same chunk, so the
# three files are only usable as a complete, mutually consistent set.
_store_files = (DOC_INDEX_PATH, TEXTS_PATH, METADATA_PATH)
_store_loaded = False

if all(os.path.exists(path) for path in _store_files):
    print("üîÑ Loading existing FAISS index...")
    doc_index = faiss.read_index(DOC_INDEX_PATH)

    # NOTE: pickle.load can execute arbitrary code; only open files that this
    # notebook wrote itself.
    with open(TEXTS_PATH, "rb") as f:
        texts = pickle.load(f)

    with open(METADATA_PATH, "rb") as f:
        metadata = pickle.load(f)

    # Reject a store whose pieces drifted apart or that was built with an
    # embedding model of a different dimension.
    _store_loaded = (
        doc_index.d == dimension
        and doc_index.ntotal == len(texts) == len(metadata)
    )
    if not _store_loaded:
        print("Stored FAISS index is inconsistent with its texts/metadata; rebuilding.")

if not _store_loaded:
    print("üÜï Creating new FAISS index...")
    doc_index = faiss.IndexFlatL2(dimension)
    texts = []
    metadata = []
    # With an empty index nothing is indexed any more: forget the record of
    # processed PDFs so that ingestion embeds them again.
    indexed_files = set()

# ======================================================
# INCREMENTAL DOCUMENT INGESTION
# ======================================================
new_documents = []

# sorted() keeps the ingestion (and therefore chunk) order reproducible; the
# extension test is case-insensitive so that e.g. "Policy.PDF" is not skipped.
for file in sorted(os.listdir(DATA_DIR)):
    if file.lower().endswith(".pdf") and file not in indexed_files:
        loader = PyPDFLoader(os.path.join(DATA_DIR, file))
        pages = loader.load()
        for p in pages:
            # Keep only the file name as the source shown to the user.
            p.metadata["source"] = file
        new_documents.extend(pages)
        indexed_files.add(file)

if new_documents:
    print(f"üÜï Found {len(new_documents)} new pages")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    new_chunks = splitter.split_documents(new_documents)
    new_texts = [c.page_content for c in new_chunks]
    new_metadata = [c.metadata for c in new_chunks]

    # Pages without extractable text produce no chunks; adding an empty
    # matrix to the index would fail, so embedding is skipped in that case.
    if new_texts:
        new_embeddings = embedder.encode(new_texts, show_progress_bar=True)

        # FAISS expects a float32 matrix.
        doc_index.add(np.asarray(new_embeddings, dtype="float32"))
        texts.extend(new_texts)
        metadata.extend(new_metadata)

    # Persist all four artefacts together so they stay consistent.
    faiss.write_index(doc_index, DOC_INDEX_PATH)

    with open(TEXTS_PATH, "wb") as f:
        pickle.dump(texts, f)

    with open(METADATA_PATH, "wb") as f:
        pickle.dump(metadata, f)

    with open(INDEXED_FILES_PATH, "w") as f:
        json.dump(list(indexed_files), f)

    if new_texts:
        print("‚úÖ New embeddings added & saved")
    else:
        print("No extractable text found in the new pages; nothing was embedded")
else:
    print("‚úÖ No new documents found")

# ======================================================
# MEMORY VECTOR STORE
# ======================================================
# In-process cache of earlier question/answer pairs: vector i of memory_index
# belongs to memory_texts[i]. Both start empty on every run.
memory_index = faiss.IndexFlatL2(dimension)
memory_texts = []

def search_memory(question):
    """Look up the closest stored memory entry for ``question``.

    Returns the stored entry when its distance to the question embedding is
    below MEMORY_SIMILARITY_THRESHOLD, otherwise None (also when the memory
    is still empty).
    """
    if len(memory_texts) == 0:
        return None

    query_vec = np.array(embedder.encode([question]))
    distances, positions = memory_index.search(query_vec, 1)
    best_distance = distances[0][0]
    best_position = positions[0][0]

    if not (best_distance < MEMORY_SIMILARITY_THRESHOLD):
        return None
    return memory_texts[best_position]

def store_memory(question, answer):
    """Append a question/answer pair to the memory index and text list.

    The pair is stored as one "Q: ... / A: ..." entry, and the embedding of
    that whole entry is what gets indexed.
    """
    record = "Q: {}\nA: {}".format(question, answer)
    record_vec = np.array(embedder.encode([record]))
    memory_index.add(record_vec)
    memory_texts.append(record)

def retrieve_chunks(query):
    """Return the document chunks closest to ``query``.

    Searches the document index for the TOP_K nearest chunks and keeps those
    whose distance is below SIMILARITY_THRESHOLD.

    Returns:
        A list of dicts with the keys "text" (chunk text) and "source"
        (value of the chunk's "source" metadata).
    """
    query_vec = np.array(embedder.encode([query]))
    distances, indices = doc_index.search(query_vec, TOP_K)

    return [
        {"text": texts[position], "source": metadata[position]["source"]}
        for position, distance in zip(indices[0], distances[0])
        if distance < SIMILARITY_THRESHOLD
    ]


# ======================================================
# PROMPTS
# ======================================================
# Prompt that asks the LLM to decompose a question into a numbered list of
# sub-questions without answering them; the reply is parsed by
# split_into_subqueries().
subquery_prompt = ChatPromptTemplate.from_template("""
Break the question into 3‚Äì6 focused sub-questions.
Do NOT answer.

Question:
{question}

Return only a numbered list.
""")

def split_into_subqueries(question):
    """Ask the LLM to decompose ``question`` and parse the numbered list.

    Lines of the reply that start (after optional indentation) with a number
    followed by "." or ")" are treated as list items; everything else is
    ignored, so headings or stray text never raise.

    Args:
        question: The user's question.

    Returns:
        A list of at most MAX_SUBQUERIES non-empty sub-question strings
        (possibly empty when the reply contains no numbered items).
    """
    resp = llm.invoke(subquery_prompt.format_messages(question=question))

    # The message content is not guaranteed to be a plain string (it can be a
    # list of parts), so it is normalised before being split into lines.
    raw_text = extract_text_from_response(resp)

    subs = []
    for line in raw_text.splitlines():
        # "<number><. or )> <text>"; the capture ends on the last
        # non-blank character, which also drops empty items.
        item = re.match(r"\s*\d+\s*[.)]\s*(.*\S)", line)
        if item:
            subs.append(item.group(1))

    return subs[:MAX_SUBQUERIES]


# Prompt for the final answer: instructs the LLM to answer only from the
# supplied context and to reply with a fixed sentence when the information is
# missing. rag_answer() fills the {context} and {question} placeholders and
# later compares the reply against that fixed sentence.
answer_prompt = ChatPromptTemplate.from_template("""
You are a document-grounded enterprise policy assistant.

Rules:
- Use ONLY provided context
- No external knowledge
- Elaborate answer in 4-5 lines
- give it in bullet points if needed
- If missing, say exactly:
  "Information not found in the provided documents."

Context:
{context}

Question:
{question}

Answer:
""")


#RANDOM FOREST FEATURE EXTRACTION
def token_overlap_ratio(answer, context):
    """Fraction of distinct answer tokens that also occur in the context.

    Both texts are lower-cased and split on whitespace. Returns 0.0 when the
    answer has no tokens.
    """
    answer_tokens = set(answer.lower().split())
    context_tokens = set(context.lower().split())

    if len(answer_tokens) == 0:
        return 0.0

    shared_tokens = answer_tokens.intersection(context_tokens)
    return len(shared_tokens) / len(answer_tokens)

def extract_rf_features(question, context, answer):
    """Build the feature row fed to the Random Forest classifier.

    Features, in order: cosine similarity question/context, answer/context
    and question/answer, the token overlap ratio of answer vs. context, the
    context word count and the answer word count. When the answer is empty,
    the question is embedded in its place.

    Returns:
        A numpy array of shape (1, 6).
    """
    question_vec = embedder.encode(question)
    context_vec = embedder.encode(context)
    answer_vec = embedder.encode(answer or question)

    def similarity(left, right):
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        return cosine_similarity([left], [right])[0][0]

    features = [
        similarity(question_vec, context_vec),
        similarity(answer_vec, context_vec),
        similarity(question_vec, answer_vec),
        token_overlap_ratio(answer, context),
        len(context.split()),
        len(answer.split()),
    ]
    return np.array(features).reshape(1, -1)


def extract_text_from_response(response):
    """Return the plain text carried by an LLM response message.

    ``response.content`` may be a plain string or a list of parts. A part may
    be a string, a dict with a "text" key, or an object with a ``text``
    attribute; parts without text (or whose text is not a string) are skipped.

    Args:
        response: Any object exposing a ``content`` attribute.

    Returns:
        The stripped text. Text parts of a list are joined with one space.
        ``None`` content yields an empty string; any other content type falls
        back to ``str(content)``.
    """
    content = response.content

    # No content at all: report "no text" rather than the string "None".
    if content is None:
        return ""

    # Case 1: content is already a string
    if isinstance(content, str):
        return content.strip()

    # Case 2: list of parts
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                text = part
            elif isinstance(part, dict):
                text = part.get("text")
            else:
                text = getattr(part, "text", None)
            # Non-text parts yield None (or a non-string) here; joining them
            # would raise, so only real strings are kept.
            if isinstance(text, str):
                pieces.append(text)
        return " ".join(pieces).strip()

    # Fallback
    return str(content).strip()

def is_definition_question(question):
    """True when the trimmed, lower-cased question starts with "what is" or "define"."""
    normalized = question.lower().strip()
    return normalized.startswith(("what is", "define"))


def is_procedure_question(question):
    """True when the lower-cased question contains any procedure keyword.

    Matching is by substring, so a keyword also matches inside longer words.
    """
    lowered = question.lower()
    for keyword in (
        "how to", "how do i", "process", "procedure",
        "steps", "file", "complaint", "apply",
    ):
        if keyword in lowered:
            return True
    return False


def is_scenario_question(question):
    """True when the lower-cased question starts with a scenario phrase.

    The question is not trimmed, so leading whitespace prevents a match.
    """
    scenario_prefixes = (
        "can i", "is it allowed", "is it okay",
        "what if", "suppose", "would it be",
        "am i allowed",
    )
    return question.lower().startswith(scenario_prefixes)


# ======================================================
# MAIN RAG FUNCTION
# ======================================================
def rag_answer(question):
    """Answer ``question`` from the indexed documents and rate the answer.

    Steps, in order: memory lookup, disallowed-keyword filter, intent-aware
    retrieval, context assembly, Gemini call, Random-Forest / rule based
    hallucination check, memory storage.

    Args:
        question: The user's question as plain text.

    Returns:
        A tuple ``(answer, sources, rf_result)`` where

        * ``answer`` is the answer text (for a memory hit, the stored
          "Q: ... / A: ..." entry),
        * ``sources`` is a set of source names (``{"memory"}`` for a memory
          hit, empty when the question is rejected before the LLM call),
        * ``rf_result`` is a dict with the keys ``hallucinated``,
          ``confidence`` and ``reason``; ``reason`` is one of
          ``memory_reuse``, ``unanswerable``, ``hallucination``,
          ``low_confidence`` or ``grounded``.
    """
    NOT_FOUND = "Information not found in the provided documents."
    MIN_CONFIDENCE = 0.6

    def is_cross_policy_question(question):
        """Heuristic: does the question contain a comparison/relation keyword?"""
        keywords = [
            "interact", "between", "relation", "difference",
            "versus", "vs", "compare", "different",
            "exempt", "apply to", "applicable to"
        ]
        q = question.lower()
        return any(k in q for k in keywords)

    def unanswerable():
        """Build the fixed result used when the question is rejected up front."""
        return NOT_FOUND, set(), {
            "hallucinated": True,
            "confidence": None,
            "reason": "unanswerable"
        }

    scenario = is_scenario_question(question)
    # Scenario and cross-policy questions are neither served from nor written
    # to the memory cache.
    cacheable = not scenario and not is_cross_policy_question(question)

    # ===============================
    # 1. MEMORY LOOKUP (cacheable questions only)
    # ===============================
    if cacheable:
        memory_hit = search_memory(question)
        if memory_hit:
            return memory_hit, {"memory"}, {
                "hallucinated": False,
                "confidence": None,
                "reason": "memory_reuse"
            }

    # ===============================
    # 2. DISALLOWED KEYWORDS
    # ===============================
    # Keywords are matched as whole words (optionally plural). A plain
    # substring test would reject every question containing e.g. "complaint",
    # "training" or "paid" because of the keyword "ai".
    if DISALLOWED_KEYWORDS:
        blocked = re.compile(
            r"\b(?:" + "|".join(re.escape(w) for w in DISALLOWED_KEYWORDS) + r")s?\b"
        )
        if blocked.search(question.lower()):
            return unanswerable()

    # ===============================
    # 3. RETRIEVAL STRATEGY (INTENT-AWARE)
    # ===============================
    # Only procedure questions that are neither definitions nor scenarios are
    # decomposed into sub-queries; everything else is retrieved directly.
    if (
        not is_definition_question(question)
        and not scenario
        and is_procedure_question(question)
    ):
        # Fall back to the question itself when no sub-query could be parsed.
        subqueries = split_into_subqueries(question) or [question]
        retrieved = []
        for sq in subqueries:
            retrieved.extend(retrieve_chunks(sq))
    else:
        retrieved = retrieve_chunks(question)

    if not retrieved:
        return unanswerable()

    # ===============================
    # 4. CONTEXT BUILDING
    # ===============================
    context = ""
    sources = set()
    seen_texts = set()

    for r in retrieved:
        # Different sub-queries often return the same chunk; repeating it
        # would only eat into the context budget.
        if r["text"] in seen_texts:
            continue
        if len(context) + len(r["text"]) > MAX_CONTEXT_CHARS:
            break
        seen_texts.add(r["text"])
        context += r["text"] + "\n\n"
        sources.add(r["source"])

    # Scenario-specific caution
    if scenario:
        context = (
            "Use the following policy rules, restrictions, "
            "and conditions to answer cautiously.\n\n"
            + context
        )

    # ===============================
    # 5. LLM CALL (ChatPromptTemplate)
    # ===============================
    response = llm.invoke(
        answer_prompt.format_messages(
            context=context,
            question=question
        )
    )

    # Gemini-safe extraction
    answer = extract_text_from_response(response)

    # ===============================
    # 6. RANDOM FOREST HALLUCINATION CHECK
    # ===============================
    X = extract_rf_features(
        question=question,
        context=context,
        answer=answer
    )

    rf_prob = rf_model.predict_proba(X)[0][1]

    # Scenario questions use a different threshold than all other questions.
    threshold = 0.75 if scenario else 0.6

    # Rule-based flags: the answer mentions something the context never does.
    answer_lc = answer.lower()
    context_lc = context.lower()
    rule_flag = any(
        marker in answer_lc and marker not in context_lc
        for marker in ("matrix", "http")
    )

    # ===============================
    # FINAL DECISION LOGIC
    # ===============================
    # NOTE(review): for non-scenario questions threshold == MIN_CONFIDENCE, so
    # every probability lands in "hallucination" or "low_confidence" and
    # "grounded" is only reachable for scenario questions (0.6 <= p < 0.75).
    # Confirm whether predict_proba(...)[0][1] is P(hallucinated) or
    # P(grounded) and adjust the comparisons accordingly.
    # A separate check for a probability of exactly 0.0 is not needed: that
    # value is already below MIN_CONFIDENCE.
    if answer.strip().lower() == NOT_FOUND.lower():
        reason = "unanswerable"
    elif rf_prob >= threshold or rule_flag:
        reason = "hallucination"
    elif rf_prob < MIN_CONFIDENCE:
        reason = "low_confidence"
    else:
        reason = "grounded"

    rf_result = {
        "hallucinated": reason != "grounded",
        "confidence": round(rf_prob, 3),
        "reason": reason
    }

    # ===============================
    # 7. MEMORY STORAGE (cacheable questions only)
    # ===============================
    # The "not found" test is case-insensitive, like the decision above, and
    # empty answers are never cached.
    # NOTE(review): answers are cached regardless of rf_result and are later
    # returned from memory with hallucinated=False -- confirm this is intended.
    if (
        cacheable
        and answer.strip()
        and answer.strip().lower() != NOT_FOUND.lower()
    ):
        store_memory(question, answer)

    # ===============================
    # 8. FINAL RETURN
    # ===============================
    return answer, sources, rf_result



# ======================================================
# CHAT LOOP
# ======================================================
print("\n‚úÖ RAG + MEMORY + FAISS + GEMINI + RF READY (COLAB)\n")

while True:
    # End the session cleanly when the input stream closes or is interrupted.
    try:
        q = input("‚ùì Ask a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        break

    if q.lower() == "exit":
        break
    if not q:
        # Nothing was typed: do not spend an LLM call on it.
        continue

    # A failing request (for example a Gemini quota/rate-limit error) is
    # reported and the loop keeps running instead of ending the session.
    try:
        answer, sources, rf_result = rag_answer(q)
    except Exception as exc:
        print(f"\nRequest failed ({type(exc).__name__}): {exc}")
        print("\n" + "=" * 80)
        continue

    print("\nüß† Answer:\n", answer)

    # ===============================
    # HALLUCINATION STATUS
    # ===============================
    if rf_result is None:
        print("\n‚ö†Ô∏è No confidence evaluation available")
    else:
        reason = rf_result["reason"]

        if reason == "grounded":
            print(f"\n‚úÖ Answer is grounded (confidence={rf_result['confidence']})")
        elif reason == "memory_reuse":
            print("\nüß† Answer reused from memory (cached)")
        elif reason == "unanswerable":
            print("\n‚ö†Ô∏è Answer not found in documents")
        elif reason == "low_confidence":
            print(f"\n‚ö†Ô∏è Answer unreliable (confidence={rf_result['confidence']})")
        else:
            # Any other reason is reported as a possible hallucination.
            print(f"\n‚ùå Possible hallucination (confidence={rf_result['confidence']})")

    # ===============================
    # SOURCES
    # ===============================
    if sources:
        print("\nüìö Sources:")
        # Sets have no stable order; sort for reproducible output.
        for s in sorted(sources):
            print(" -", s)

    print("\n" + "=" * 80)



‚úÖ Random Forest hallucination model loaded
üÜï Creating new FAISS index...
üÜï Found 127 new pages


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

‚úÖ New embeddings added & saved

‚úÖ RAG + MEMORY + FAISS + GEMINI + RF READY (COLAB)

‚ùì Ask a question: what is posh policy?


ChatGoogleGenerativeAIError: Error calling model 'gemini-flash-latest' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 48.49834884s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '20'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '48s'}]}}

In [None]:
import os

# Delete every persisted vector-store artefact that currently exists, so that
# the next run of the notebook rebuilds the index from the PDFs.
for stale_path in filter(os.path.exists, [
    DOC_INDEX_PATH,
    TEXTS_PATH,
    METADATA_PATH,
    INDEXED_FILES_PATH,
]):
    os.remove(stale_path)

print("Old FAISS index deleted. Re-run notebook.")

Old FAISS index deleted. Re-run notebook.
