In [56]:
#!pip install pinecone-client==2.2.4
#!pip install -U pinecone pypdf sentence-transformers ddgs groq sqlite-utils



In [8]:
#CELL 2 — Imports & API keys
import os
import uuid
import sqlite3
from datetime import datetime

from pypdf import PdfWriter, PdfReader
from sentence_transformers import SentenceTransformer
from ddgs import DDGS
from groq import Groq

from pinecone import Pinecone, ServerlessSpec





In [41]:
PINECONE_API_KEY = " "
#PINECONE_ENV = "us-east-1-aws"
GROQ_API_KEY = " "


pc = Pinecone(api_key=PINECONE_API_KEY)
groq_client = Groq(api_key=GROQ_API_KEY)


In [21]:
#pc.delete_index("rag-lob-index")


In [26]:
#CELL 4 — Create Pinecone Index (Serverless)
INDEX_NAME = "rag-lob-index"
DIMENSION = 384  #  FIXED

pc.create_index(
    name=INDEX_NAME,
    dimension=DIMENSION,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

index = pc.Index(INDEX_NAME)



In [27]:
#CELL 5 — Create LOB Folder Hierarchy + PDFs
BASE_DIR = "lob_docs"
LOBS = [
    "banking", "insurance", "pharma", "automobile",
    "energy", "cybersecurity", "environment"
]

os.makedirs(BASE_DIR, exist_ok=True)

def create_pdf(path, text):
    writer = PdfWriter()
    writer.add_blank_page(width=612, height=792)
    with open(path, "wb") as f:
        writer.write(f)
    with open(path.replace(".pdf", ".txt"), "w") as f:
        f.write(text)

for lob in LOBS:
    lob_path = os.path.join(BASE_DIR, lob)
    os.makedirs(lob_path, exist_ok=True)

    for i in range(2):
        content = f"""
        This document belongs to {lob.upper()} domain.
        It explains policies, workflows, risks, compliance,
        operational challenges and domain-specific terminology.
        """
        create_pdf(f"{lob_path}/{lob}_{i}.pdf", content)


In [28]:
#CELL 6 — Chunking Strategy (EXPLAINABLE)
def chunk_text(text, chunk_size=400, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[i:i+chunk_size]))
    return chunks


In [29]:
#CELL 7 — Embedding + Pinecone Upser
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def ingest_documents():
    for lob in os.listdir(BASE_DIR):
        lob_path = os.path.join(BASE_DIR, lob)
        for file in os.listdir(lob_path):
            if file.endswith(".txt"):
                with open(os.path.join(lob_path, file)) as f:
                    text = f.read()

                chunks = chunk_text(text)

                vectors = []
                for chunk in chunks:
                    vector_id = str(uuid.uuid4())  # VECTOR ID
                    embedding = embedder.encode(chunk).tolist()

                    vectors.append({
                        "id": vector_id,
                        "values": embedding,
                        "metadata": {
                            "lob": lob,
                            "source": file,
                            "text": chunk
                        }
                    })

                index.upsert(vectors=vectors)

ingest_documents()


In [30]:
#CELL 8 — SQLite Chat Memory (User + Session)
conn = sqlite3.connect("chat.db")
cur = conn.cursor()

cur.execute("""
CREATE TABLE IF NOT EXISTS chat_history (
    user_id TEXT,
    session_id TEXT,
    query TEXT,
    response TEXT,
    timestamp DATETIME
)
""")
conn.commit()


In [31]:
#CELL 9 — Fetch Last 10 Chats (Context Memory)
def fetch_last_n(user_id, session_id, n=10):
    cur.execute("""
    SELECT query, response FROM chat_history
    WHERE user_id=? AND session_id=?
    ORDER BY timestamp DESC LIMIT ?
    """, (user_id, session_id, n))
    rows = cur.fetchall()[::-1]
    return "\n".join([f"User: {q}\nAssistant: {r}" for q, r in rows])


In [32]:
#CELL 11 — Vector RAG Retrieval
def vector_search(query):
    q_emb = embedder.encode(query).tolist()
    result = index.query(vector=q_emb, top_k=3, include_metadata=True)

    return "\n".join([m["metadata"]["text"] for m in result["matches"]])


In [33]:
#CELL 12 — Web Search Tool (DuckDuckGo)
def web_search(query):
    with DDGS() as ddgs:
        results = ddgs.text(query, max_results=3)
        return "\n".join([r["body"] for r in results])


In [34]:
#CELL 13 — Guardrails
def guardrails(text):
    banned = ["violence", "hate", "personal data"]
    for b in banned:
        if b in text.lower():
            return "Content blocked by policy."
    return text


In [35]:
#CELL 14 — Main Orchestrator (FINAL)
def handle_query(user_id, session_id, query):
    history = fetch_last_n(user_id, session_id)

    route = route_query(query)

    if route == "VECTOR":
        context = vector_search(query)
    else:
        context = web_search(query)

    prompt = f"""
    Chat history:
    {history}

    Context:
    {context}

    Answer the query:
    {query}
    """

    res = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}]
    )

    answer = guardrails(res.choices[0].message.content)

    cur.execute(
        "INSERT INTO chat_history VALUES (?, ?, ?, ?, ?)",
        (user_id, session_id, query, answer, datetime.utcnow())
    )
    conn.commit()

    return answer


In [38]:
# ======================================
# ROUTING AGENT
# Decides where the query should go
# ======================================

def route_query(query: str) -> str:
    """
    Routes query to:
    - VECTOR: domain / LOB / document-related queries
    - SEARCH: general knowledge queries
    """

    domain_keywords = [
        "policy", "claim", "aml", "banking", "insurance",
        "pharma", "energy", "automobile", "cyber",
        "environment", "marine", "technical"
    ]

    query_lower = query.lower()

    for keyword in domain_keywords:
        if keyword in query_lower:
            return "VECTOR"

    return "SEARCH"


In [39]:
#CELL 15 — MULTI-USER TEST
handle_query("user_1", "session_1", "Explain AML process in banking")


  (user_id, session_id, query, answer, datetime.utcnow())
  cur.execute(


'**Anti-Money Laundering (AML) Process in Banking**\n\nThe Anti-Money Laundering (AML) process is a critical aspect of banking operations, aimed at preventing and detecting money laundering activities. In this section, we will outline the AML process in banking, its policies, workflows, risks, compliance, operational challenges, and relevant terminology.\n\n**Objective of AML Process**\n\nThe primary objective of the AML process is to prevent and detect money laundering activities, ensuring that financial institutions comply with regulatory requirements and adhere to strict laws and regulations.\n\n**AML Policies**\n\nBanks develop and implement AML policies that outline procedures for:\n\n1. **Customer Due Diligence (CDD)**: Verifying the identity of customers, understanding their business and financial activities, and assessing their risk level.\n2. **Transaction Monitoring**: Identifying and filtering suspicious transactions, such as those with unusual patterns or high-risk countrie

In [40]:
handle_query("user_2", "session_99", "What is quantum computing?")


  (user_id, session_id, query, answer, datetime.utcnow())
  cur.execute(


'Quantum computing is a rapidly-emerging technology that harnesses the laws of quantum mechanics to solve problems too complex for classical computers.'

In [42]:
import uuid

# Multi-user session tracking
users = {}  # user_id -> session_id

def get_session(user_id):
    """
    Return session_id for a user.
    If the user is new, assign a new session UUID.
    """
    if user_id not in users:
        users[user_id] = str(uuid.uuid4())
        print(f"New user added: {user_id}, session: {users[user_id]}")
    return users[user_id]


In [48]:
from datetime import datetime


In [49]:
def save_chat(user_id, session_id, query, response):
    timestamp = datetime.now()  # <-- use datetime.now() directly
    cur.execute("""
    INSERT INTO chat_history (user_id, session_id, timestamp, query, response)
    VALUES (?, ?, ?, ?, ?)
    """, (user_id, session_id, timestamp, query, response))
    conn.commit()


In [50]:
# Update Chat Functions

#Use session_id to track each user separately. Modify your fetch and save functions
# Save chat per user & session
# def save_chat(user_id, session_id, query, response):
#     timestamp = datetime.datetime.now()
#     cur.execute("""
#     INSERT INTO chat_history (user_id, session_id, timestamp, query, response)
#     VALUES (?, ?, ?, ?, ?)
#     """, (user_id, session_id, timestamp, query, response))
#     conn.commit()

# Fetch last N chats for context per user & session
def fetch_last_n(user_id, session_id, n=10):
    cur.execute("""
    SELECT query, response FROM chat_history
    WHERE user_id=? AND session_id=?
    ORDER BY timestamp DESC LIMIT ?
    """, (user_id, session_id, n))
    rows = cur.fetchall()
    rows.reverse()
    return "\n".join([f"User: {q}\nAsst: {r}" for q,r in rows])


In [54]:
# VECTOR query handler
# General query handler (DuckDuckGo + Groq LLM)
def handle_general_query(query):
    response = ""

    # DuckDuckGo search (for general queries)
    with DDGS() as ddgs:
        results = ddgs.text(query, max_results=3)
        for r in results:
            response += f"Title: {r.get('title','')}\nURL: {r.get('href','')}\nSnippet: {r.get('body','')}\n---\n"

    # Optionally, you can also call Groq LLM for reasoning
    # response_llm = groq.run(prompt=f"Answer the user query: {query}")
    # response += f"\n[Groq LLM]: {response_llm}\n"

    if response.strip() == "":
        response = "Sorry, I don't have information about this topic."

    return response


In [55]:
#Test Multi-User Pipeline
# Example 10+ users
test_queries = [
    "Explain AML process in banking",
    "Energy policy claims",
    "Cyber security risk assessment",
    "Marine engineering standards",
    "Automobile insurance policies",
    "Environmental policy regulations",
    "Pharma compliance work",
    "Technical line liability",
    "Property and casualty claims",
    "Financial fraud in banking",
    "New user query example"
]

for i, q in enumerate(test_queries):
    user_id = f"user_{i+1}"
    print(f"\n--- {user_id} ---")
    resp = handle_query(user_id, q)
    print(resp)



--- user_1 ---


  cur.execute("""


Source: banking_1.txt
LOB: banking
Text: This document belongs to BANKING domain. It explains policies, workflows, risks, compliance, operational challenges and domain-specific terminology.
---
Source: banking_0.txt
LOB: banking
Text: This document belongs to BANKING domain. It explains policies, workflows, risks, compliance, operational challenges and domain-specific terminology.
---
Source: pharma_1.txt
LOB: pharma
Text: This document belongs to PHARMA domain. It explains policies, workflows, risks, compliance, operational challenges and domain-specific terminology.
---


--- user_2 ---
Source: energy_1.txt
LOB: energy
Text: This document belongs to ENERGY domain. It explains policies, workflows, risks, compliance, operational challenges and domain-specific terminology.
---
Source: energy_0.txt
LOB: energy
Text: This document belongs to ENERGY domain. It explains policies, workflows, risks, compliance, operational challenges and domain-specific terminology.
---
Source: insurance_0.tx