In [1]:
# ================================
# CELL 1 — INSTALL DEPENDENCIES (RUN FIRST)
# ================================
!pip install -U pinecone sentence-transformers pypdf groq ddgs

# After running this cell:
# Runtime -> Restart runtime (MANDATORY in Colab)


# ================================
# CELL 2 — IMPORTS
# ================================
import os
import sqlite3
import uuid
import warnings
from typing import List, Dict

from ddgs import DDGS
from groq import Groq
from pinecone import Pinecone, ServerlessSpec
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer


# ================================
# CELL 3 — API KEYS (READ FROM THE ENVIRONMENT)
# ================================
# Secrets are never written into the notebook itself. Export PINECONE_API_KEY
# and GROQ_API_KEY before starting the kernel, or in Colab copy them from the
# secrets panel into the environment before running this cell, e.g.:
#   from google.colab import userdata
#   os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")
#   os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "").strip()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()

for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if _key_value:
        # Re-export the whitespace-stripped value so libraries that read the
        # environment directly see exactly what this notebook uses.
        os.environ[_key_name] = _key_value
    else:
        # Never overwrite the environment with a blank placeholder: that would
        # hide a real key exported elsewhere and only fail later with an
        # opaque authentication error.
        warnings.warn(
            f"{_key_name} is not set; export it before running the cells that "
            "call the corresponding service."
        )


# ================================
# CELL 4 — INITIALIZE MODELS
# ================================
# Sentence-embedding model; the same instance encodes document chunks and
# user queries further down in the notebook.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Groq chat-completions client authenticated with GROQ_API_KEY.
groq_client = Groq(api_key=GROQ_API_KEY)


# ================================
# CELL 5 — PINECONE INITIALIZATION
# ================================
pc = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = "lob-knowledge-index"

# Create the serverless index only when no index with this name exists yet;
# otherwise the existing one is reused as-is.
existing_index_names = {entry.name for entry in pc.list_indexes()}
if INDEX_NAME not in existing_index_names:
    index_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=INDEX_NAME,
        # NOTE(review): presumably the output size of the embedding model
        # loaded above — confirm if the model is ever swapped.
        dimension=768,
        metric="cosine",
        spec=index_spec,
    )

# Handle used by the upsert and query cells.
index = pc.Index(INDEX_NAME)


# ================================
# CELL 6 — CREATE SAMPLE PDF DATA (LOB FOLDERS)
# ================================
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Root folder that receives one sub-folder per line of business (LOB).
BASE_DIR = "lob_documents"
os.makedirs(BASE_DIR, exist_ok=True)

# Lines of business; each one gets its own folder and sample policy PDF.
LOBS = [
    "banking",
    "automobile",
    "energy",
    "pharma",
    "cybersecurity",
    "environment",
    "entertainment"
]

# Sentence written repeatedly into the sample PDF of each LOB.
sample_text = {
    "banking": "AML, KYC, transaction monitoring, policy number BNK-10234.",
    "automobile": "Auto liability, collision coverage, policy AUTO-55678.",
    "energy": "Energy risk, drilling liability, policy ENG-88421.",
    "pharma": "Clinical trials, drug liability, policy PHR-77412.",
    "cybersecurity": "Data breach insurance, SOC compliance, policy CYB-99811.",
    "environment": "Environmental risk, pollution liability, ENV-22345.",
    "entertainment": "Media liability, IP protection, policy ENT-66554."
}

for lob in LOBS:
    lob_dir = os.path.join(BASE_DIR, lob)
    os.makedirs(lob_dir, exist_ok=True)

    pdf = canvas.Canvas(os.path.join(lob_dir, f"{lob}_policy.pdf"), pagesize=letter)
    text_block = pdf.beginText(40, 750)
    # The same sentence is written 30 times so the document is long enough
    # to be split into several chunks later.
    for line in [sample_text[lob]] * 30:
        text_block.textLine(line)
    pdf.drawText(text_block)
    pdf.save()


# ================================
# CELL 7 — PDF LOADER
# ================================
def load_pdfs(base_dir: str):
    """Extract the text of every PDF stored under ``base_dir/<lob>/``.

    Args:
        base_dir: Directory whose immediate sub-directories are named after a
            line of business and contain that LOB's PDF files.

    Returns:
        A list with one ``{"lob": <sub-directory name>, "text": <extracted
        text>}`` dict per PDF, ordered by sub-directory name and then by file
        name so repeated runs yield the same sequence.
    """
    docs = []
    # sorted(): os.listdir order is arbitrary, which would make chunk order
    # differ between runs.
    for lob in sorted(os.listdir(base_dir)):
        lob_path = os.path.join(base_dir, lob)
        if not os.path.isdir(lob_path):
            # Stray files next to the LOB folders are not LOBs.
            continue
        for file in sorted(os.listdir(lob_path)):
            if not file.lower().endswith(".pdf"):
                # Anything that is not a PDF would make PdfReader fail.
                continue
            reader = PdfReader(os.path.join(lob_path, file))
            # `or ""` guards against a page yielding no text; joining with a
            # newline keeps the last word of one page from fusing with the
            # first word of the next.
            page_texts = [page.extract_text() or "" for page in reader.pages]
            docs.append({"lob": lob, "text": "\n".join(page_texts)})
    return docs


# Load every PDF found under BASE_DIR as a list of {"lob", "text"} records.
documents = load_pdfs(BASE_DIR)


# ================================
# CELL 8 — CHUNKING STRATEGY
# ================================
def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunks in document order; empty for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This window already reaches the end of the text; a further
            # window would only repeat the tail of this chunk.
            break
    return chunks


# ================================
# CELL 9 — EMBEDDING + UPSERT
# ================================
# Number of records sent per upsert request; keeps each request small instead
# of shipping the whole corpus in a single call.
UPSERT_BATCH_SIZE = 100

vectors = []

for doc in documents:
    # Keep each chunk's original position but drop whitespace-only chunks,
    # which carry no information worth embedding.
    indexed_chunks = [
        (i, chunk)
        for i, chunk in enumerate(chunk_text(doc["text"]))
        if chunk.strip()
    ]
    if not indexed_chunks:
        continue

    # Encode all chunks of the document in one call rather than one by one.
    embeddings = embedding_model.encode([chunk for _, chunk in indexed_chunks])

    for (i, chunk), embedding in zip(indexed_chunks, embeddings):
        # Deterministic, content-derived ID: re-running this cell overwrites
        # the same records instead of adding duplicates under new random IDs.
        id_seed = f"{doc['lob']}/{i}/{chunk}"
        vector_id = f"{doc['lob']}-{uuid.uuid5(uuid.NAMESPACE_URL, id_seed)}"
        metadata = {
            "lob": doc["lob"],
            "chunk_index": i,
            "text": chunk
        }
        vectors.append((vector_id, embedding.tolist(), metadata))

# Upsert in batches; an empty `vectors` list results in no request at all.
for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])


# ================================
# CELL 10 — SQLITE FOR MULTI-USER MEMORY
# ================================
# SQLite file that stores each user's questions and the answers returned.
CHAT_DB_PATH = "chat_memory.db"

# One row per handled query: who asked, what was asked, what was answered.
CREATE_CHAT_HISTORY_SQL = """
CREATE TABLE IF NOT EXISTS chat_history (
    user_id TEXT,
    query TEXT,
    response TEXT
)
"""

conn = sqlite3.connect(CHAT_DB_PATH)
cursor = conn.cursor()
cursor.execute(CREATE_CHAT_HISTORY_SQL)
conn.commit()


# ================================
# CELL 11 — QUERY ROUTER AGENT
# ================================
def route_query(query: str):
    """Decide whether a query is answered from the LOB index or by web search.

    The query is matched case-insensitively against the names in ``LOBS``; the
    first LOB (in ``LOBS`` order) whose name occurs in the query wins.

    Returns:
        ``("rag", <lob>)`` when a LOB name occurs in the query, otherwise
        ``("general", None)``.
    """
    lowered = query.lower()
    matched_lob = next((name for name in LOBS if name in lowered), None)
    if matched_lob is None:
        return "general", None
    return "rag", matched_lob


# ================================
# CELL 12 — RETRIEVAL FUNCTION
# ================================
def retrieve_context(query, lob):
    """Fetch the stored chunks most similar to ``query`` for a single LOB.

    The query is embedded with ``embedding_model`` and sent to ``index`` with a
    metadata filter on ``lob``; the ``text`` metadata of up to five matches is
    returned as one space-separated string.
    """
    query_vector = embedding_model.encode(query).tolist()
    response = index.query(
        vector=query_vector,
        top_k=5,
        include_metadata=True,
        filter={"lob": lob},
    )
    passages = []
    for match in response["matches"]:
        passages.append(match["metadata"]["text"])
    return " ".join(passages)


# ================================
# CELL 13 — GROQ LLM CALL
# ================================
def groq_llm(prompt):
    """Send ``prompt`` as a single user message to Groq and return the reply text.

    Uses the ``llama-3.1-8b-instant`` model at temperature 0.2 and returns the
    message content of the first choice.
    """
    user_message = {"role": "user", "content": prompt}
    completion = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[user_message],
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# ================================
# CELL 14 — GENERAL SEARCH TOOL
# ================================
def general_search(query):
    """Run a DuckDuckGo text search for ``query`` and return up to three hits.

    The value returned by ``DDGS.text`` is passed through unchanged.
    """
    with DDGS() as search_client:
        return search_client.text(query, max_results=3)


# ================================
# CELL 15 — MAIN QUERY HANDLER (MULTI USER)
# ================================
def handle_query(user_id, query):
    """Answer ``query`` for ``user_id`` and record the exchange in SQLite.

    Queries that mention a known LOB are answered by the LLM from context
    retrieved for that LOB; every other query is answered with the snippets
    of a web search. The (user_id, query, response) triple is then inserted
    into ``chat_history`` and committed.

    Args:
        user_id: Identifier stored with the exchange.
        query: The user's question.

    Returns:
        The response text that was stored for this query.
    """
    route, lob = route_query(query)

    if route == "rag":
        context = retrieve_context(query, lob)
        prompt = f"""
        You are a policy assistant.
        Answer ONLY from the context.
        Do not reveal sensitive or personal data.

        Context:
        {context}

        Question:
        {query}
        """
        response = groq_llm(prompt)
    else:
        # Tolerate a missing result list and hits without a "body" field
        # instead of failing the whole request with a KeyError/TypeError.
        search_results = general_search(query) or []
        snippets = [hit.get("body", "") for hit in search_results]
        response = "\n".join(snippet for snippet in snippets if snippet)

    # Name the columns explicitly so the insert keeps working if the table
    # ever gains or reorders columns.
    cursor.execute(
        "INSERT INTO chat_history (user_id, query, response) VALUES (?, ?, ?)",
        (user_id, query, response)
    )
    conn.commit()
    return response







The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
# ================================
# CELL 16 — TEST MULTI USER
# ================================
# Smoke test: three queries from two users, answered and printed in order.
demo_requests = [
    ("user_1", "Explain AML process in banking"),
    ("user_2", "What is cybersecurity insurance?"),
    ("user_1", "Explain automobile liability"),
]
for demo_user, demo_query in demo_requests:
    print(handle_query(demo_user, demo_query))


# ================================
# CELL 17 — VIEW CHAT HISTORY
# ================================
# Dump every stored (user_id, query, response) row, one tuple per line.
rows = cursor.execute("SELECT * FROM chat_history").fetchall()
for row in rows:
    print(row)


# ================================
# END OF NOTEBOOK
# ================================

Based on the provided context, I will explain the AML (Anti-Money Laundering) process in banking, specifically in relation to policy number BNK-10234.

**AML Process in Banking:**

The AML process in banking involves a series of steps to prevent and detect money laundering activities. The process is designed to identify and report suspicious transactions, as well as to verify the identity of customers. Here's an overview of the AML process in banking:

1. **Customer Onboarding**: When a new customer opens an account, the bank is required to collect and verify the customer's identity through Know Your Customer (KYC) procedures. This includes collecting identification documents, such as a passport or driver's license, and verifying the customer's address.
2. **Transaction Monitoring**: The bank monitors all transactions, including deposits, withdrawals, and transfers, to identify suspicious activity. This includes monitoring for unusual patterns, such as large transactions or transaction

In [3]:
# ================================
# CELL 18 — TEST WITH 5 USERS
# ================================

# One query per user; users are numbered user_1 .. user_5 in this order.
user_queries = [
    "Explain AML process in banking",
    "What is cybersecurity insurance?",
    "Explain energy sector liability",
    "What is automobile collision coverage?",
    "What is KYC and why it is important?",
]
users = [
    (f"user_{number}", text)
    for number, text in enumerate(user_queries, start=1)
]

for user_id, query in users:
    print(f"\n--- {user_id} ---")
    print(handle_query(user_id, query))



--- user_1 ---
Based on the provided context, it appears that the policy number BNK-10234 is related to Anti-Money Laundering (AML) and Know Your Customer (KYC) regulations in banking. Here's an overview of the AML process in banking:

**1. Customer Onboarding:**
The bank is required to implement a robust KYC process to verify the identity of customers, including their name, address, date of birth, and other relevant information. This is typically done through documentation, such as government-issued ID, proof of address, and other supporting documents.

**2. Risk Assessment:**
The bank assesses the customer's risk profile based on various factors, including their business activity, geographic location, and transaction history. This helps the bank to identify high-risk customers who may require more stringent monitoring.

**3. Transaction Monitoring:**
The bank implements a transaction monitoring system to detect and report suspicious transactions. This system analyzes customer transa

In [4]:
# ================================
# CELL 19 — VERIFY USER ISOLATION
# ================================

def show_user_history(user_id):
    """Print every stored question/answer pair belonging to ``user_id``.

    Rows are read from ``chat_history`` through the module-level ``cursor``;
    a header line is printed first, then one ``Q:``/``A:`` pair per row.
    """
    history = cursor.execute(
        "SELECT query, response FROM chat_history WHERE user_id = ?",
        (user_id,),
    ).fetchall()

    print(f"\nChat history for {user_id}")
    for question, answer in history:
        print(f"Q: {question}")
        print(f"A: {answer}\n")

# Each user should only see the rows stored under their own user_id.
for checked_user in ("user_1", "user_3"):
    show_user_history(checked_user)



Chat history for user_1
Q: Explain AML process in banking
A: Based on the provided context, I will explain the AML (Anti-Money Laundering) process in banking, specifically in relation to policy number BNK-10234.

**AML Process in Banking:**

The AML process in banking involves a series of steps to prevent and detect money laundering activities. The process is designed to identify and report suspicious transactions, as well as to verify the identity of customers. Here's an overview of the AML process in banking:

1. **Customer Onboarding**: When a new customer opens an account, the bank is required to collect and verify the customer's identity through Know Your Customer (KYC) procedures. This includes collecting identification documents, such as a passport or driver's license, and verifying the customer's address.
2. **Transaction Monitoring**: The bank monitors all transactions, including deposits, withdrawals, and transfers, to identify suspicious activity. This includes monitoring f