In [None]:
# ======================================
# Step 1: Install Dependencies
# ======================================
%pip install numpy boto3 chromadb langchain langchain-community langchain-aws langchain-text-splitters sentence-transformers nltk

# ======================================
# Step 2: Configuration & Variables
# ======================================
import os
import chromadb
from chromadb.config import Settings

# --- AWS Configuration ---
# Secrets are taken from the environment when they are already set, so real
# credentials never have to be pasted into (and saved with) the notebook and
# existing ambient credentials are not overwritten by the placeholders below.
# The "XXX" placeholders are used only when the variables are missing; replace
# them, or preferably export the variables before starting the kernel.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "XXX")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "XXX")
AWS_REGION = "us-west-2"

# --- Bedrock Model Configuration ---
# Using a stable Claude 3 Sonnet ID which is widely available in us-west-2
BEDROCK_MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"

# --- ChromaDB Cloud Configuration ---
# Sign up at https://trychroma.com to get your API Token.
# The token is read from the CHROMA_API_KEY environment variable when set.
CHROMA_API_KEY = os.environ.get("CHROMA_API_KEY", "ck-XXX")
CHROMA_TENANT = "default_tenant"  # Usually 'default_tenant' for most users
CHROMA_DATABASE = "db_1" # Usually 'default_database'
CHROMA_COLLECTION_NAME = "rag_collection"

# Apply Environment Variables for Boto3
# (a no-op for the two credential variables when they came from the
# environment in the first place; the region is always forced to AWS_REGION).
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION

print("Configuration Loaded.")

# Initialize AWS & ChromaDB
import boto3

# Bedrock runtime client, created from a session that carries the explicit
# credentials and region defined in the configuration section.
session = boto3.Session(
    region_name=AWS_REGION,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
)
bedrock_client = session.client(service_name="bedrock-runtime")
print("Bedrock Client Initialized successfully.")

# Chroma client for a server listening on localhost:8000; the collection is
# created on first use and reused afterwards.
# NOTE(review): CHROMA_API_KEY / CHROMA_TENANT / CHROMA_DATABASE are not used
# by this connection - confirm whether a hosted Chroma instance was intended.
chroma_client = chromadb.HttpClient(port=8000, host="localhost")
collection = chroma_client.get_or_create_collection(CHROMA_COLLECTION_NAME)
print(f"Connected to Chroma. Collection '{CHROMA_COLLECTION_NAME}' ready. Count: {collection.count()}")

# Directories: raw documents live in SOURCE_DIR, generated chunks in a
# "chunked" sub-directory that is created on demand.
SOURCE_DIR = "files"
CHUNKED_DIR = os.path.join(SOURCE_DIR, "chunked")
os.makedirs(CHUNKED_DIR, exist_ok=True)

# Regular, non-hidden files directly inside SOURCE_DIR (sub-directories such
# as the chunk folder are excluded by the isfile check).
source_files = list(
    filter(
        lambda name: not name.startswith('.')
        and os.path.isfile(os.path.join(SOURCE_DIR, name)),
        os.listdir(SOURCE_DIR),
    )
)
print(f"Found {len(source_files)} files: {source_files[:5]}")

# Semantic Chunking
from sentence_transformers import SentenceTransformer, util
import re

# Sentence-embedding model shared by the chunking code below; it is loaded
# once at module level and reused for every encode() call.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

def split_sentences(text):
    """Split *text* into sentences.

    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
    ``?``; the punctuation stays attached to the preceding sentence.

    Surrounding whitespace is stripped first and empty pieces are dropped, so
    empty or whitespace-only input yields ``[]`` (instead of ``['']``) and
    text ending in a newline does not produce a trailing empty sentence.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty sentence strings in their original order.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in pieces if piece]

def semantic_chunk(text, max_tokens=350, sim_threshold=0.63):
    """Group consecutive sentences of *text* into semantically coherent chunks.

    Sentences are visited in order. A sentence is appended to the chunk being
    built only when both conditions hold:

    * the cosine similarity between the embedding of the whole current chunk
      text and the embedding of the sentence is greater than ``sim_threshold``
    * the whitespace-separated word count of chunk + sentence is below
      ``max_tokens`` (the limit is counted in words, not model tokens)

    Otherwise the current chunk is closed and the sentence starts a new one.

    Args:
        text: Text to chunk.
        max_tokens: Exclusive upper bound on the word count of a grown chunk.
        sim_threshold: Similarity a sentence must exceed to join the chunk.

    Returns:
        A list of chunk strings, each made of sentences joined by one space.
    """
    finished = []
    buffer = []

    for sentence in split_sentences(text):
        if buffer:
            joined = " ".join(buffer)
            buffer_vec = semantic_model.encode(
                joined, convert_to_numpy=False, normalize_embeddings=True
            )
            sentence_vec = semantic_model.encode(
                sentence, convert_to_numpy=False, normalize_embeddings=True
            )
            similarity = util.cos_sim(buffer_vec, sentence_vec).item()
            grown_word_count = len((joined + " " + sentence).split())

            if similarity > sim_threshold and grown_word_count < max_tokens:
                buffer.append(sentence)
            else:
                # Topic shift or size limit reached: close the chunk.
                finished.append(joined)
                buffer = [sentence]
        else:
            # The very first sentence always opens the first chunk.
            buffer.append(sentence)

    if buffer:
        finished.append(" ".join(buffer))

    return finished

# Chunk every source file and persist each chunk as its own text file named
# "ch<index>-<source base name>-len<character count>.txt" inside CHUNKED_DIR.
total_chunks_processed = 0
for file_name in source_files:
    file_path = os.path.join(SOURCE_DIR, file_name)
    with open(file_path, encoding='utf-8') as f:
        text = f.read()

    base_name, _extension = os.path.splitext(file_name)
    chunks = semantic_chunk(text)

    for i, chunk in enumerate(chunks):
        fname = f"ch{i+1}-{base_name}-len{len(chunk)}.txt"
        chunk_path = os.path.join(CHUNKED_DIR, fname)
        with open(chunk_path, 'w', encoding='utf-8') as out:
            out.write(chunk)

    total_chunks_processed = total_chunks_processed + len(chunks)
    print(f"{file_name}: {len(chunks)} semantic chunks")

print(f"Total Semantic Chunks Processed: {total_chunks_processed}")

# Load Chunks + Prepare Embeddings
import uuid

# Sorted so that batches (and therefore logs) are reproducible across runs.
chunked_files = sorted(f for f in os.listdir(CHUNKED_DIR) if f.endswith('.txt'))
documents, metadatas, ids = [], [], []

for file_name in chunked_files:
    file_path = os.path.join(CHUNKED_DIR, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # A blank chunk carries no information to embed or retrieve.
    if not content.strip():
        continue

    # Chunk files are named "ch<index>-<source base name>-len<size>.txt";
    # the base name may itself contain '-', hence the re-join of the middle.
    name_no_ext = os.path.splitext(file_name)[0]
    parts = name_no_ext.split('-')

    try:
        chunk_part = int(parts[0].replace('ch', ''))
        size = int(parts[-1].replace('len', ''))
    except ValueError:
        # File does not follow the naming scheme: keep minimal metadata.
        meta = {"source": file_name}
    else:
        meta = {
            "source": file_name,
            "file_name": "-".join(parts[1:-1]),
            "chunk": chunk_part,
            "size": size,
        }

    documents.append(content)
    metadatas.append(meta)
    # Deterministic id derived from the chunk file name, so re-running this
    # cell overwrites the same records instead of inserting duplicates.
    ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, file_name)))

print(f"Prepared {len(documents)} chunks for embedding.")

# Upsert into Chroma
# NOTE: records written by earlier runs under random ids are not removed by
# this; clear the collection once if such duplicates already exist.
BATCH_SIZE = 100
for i in range(0, len(documents), BATCH_SIZE):
    collection.upsert(
        ids=ids[i:i+BATCH_SIZE],
        documents=documents[i:i+BATCH_SIZE],
        metadatas=metadatas[i:i+BATCH_SIZE]
    )
    print(f"Upserted batch {i} → {min(i+BATCH_SIZE, len(documents))}")

print(f"ChromaDB upsert complete. Final count: {collection.count()}")

# Initialize Bedrock LLM
from langchain_aws import ChatBedrock

# Chat model used for both the rerank step and the final answer. It reuses the
# Bedrock runtime client created earlier and passes max_tokens=400 and
# temperature=0 through to the model as model_kwargs.
llm = ChatBedrock(
    model_id=BEDROCK_MODEL_ID,
    client=bedrock_client,
    model_kwargs={"max_tokens": 400, "temperature": 0}
)


# Two-Stage Retrieval
def retrieve_semantic(query, n_results=12, threshold=1.2):
    """Step 1: Semantic vector retrieval.

    Queries the Chroma collection for the ``n_results`` nearest chunks to
    *query* and keeps those whose distance is at most ``threshold``.

    Args:
        query: Natural-language query text.
        n_results: Number of neighbours requested from the collection.
        threshold: Inclusive upper bound on the distance of a kept chunk.

    Returns:
        A list of dicts with keys ``"text"``, ``"source"`` (the metadata
        ``source`` value, or ``"unknown"`` when absent) and ``"dist"`` (the
        distance rounded to 3 decimals), in the order returned by the query.
    """
    hits = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
    )

    # Only one query text was sent, so the first (only) result row is used.
    rows = zip(hits["documents"][0], hits["metadatas"][0], hits["distances"][0])
    return [
        {
            "text": doc,
            "source": md.get("source", "unknown"),
            "dist": round(distance, 3),
        }
        for doc, md, distance in rows
        if distance <= threshold
    ]

def _parse_rerank_indices(raw, n_candidates, top_k):
    """Extract at most *top_k* distinct, in-range chunk indices from *raw*."""
    import json
    import re

    # Models often wrap the array in prose or a code fence, so locate the
    # first flat [...] span instead of parsing the whole reply.
    match = re.search(r'\[[^\[\]]*\]', raw)
    if match is None:
        return []
    try:
        parsed = json.loads(match.group(0))
    except json.JSONDecodeError:
        return []

    picked = []
    for item in parsed:
        # bool is a subclass of int; treat true/false as invalid indices.
        if isinstance(item, bool) or not isinstance(item, int):
            continue
        if 0 <= item < n_candidates and item not in picked:
            picked.append(item)
    return picked[:max(top_k, 0)]


def rerank_chunks(query, candidate_chunks, top_k=6):
    """Rerank retrieved chunks by relevance using the LLM.

    Each candidate is shown to the model prefixed with its 0-based index, and
    the model is asked for a JSON array of the most relevant indices. The
    reply is parsed defensively: only distinct integer indices inside the
    candidate range are accepted and at most ``top_k`` are used.

    Args:
        query: The user query the chunks are ranked against.
        candidate_chunks: List of dicts with at least ``"source"`` and
            ``"text"`` keys.
        top_k: Maximum number of chunks to return.

    Returns:
        The selected chunk dicts in the model's ranking order. When the reply
        contains no usable index, the first ``top_k`` candidates are returned
        in their original order. An empty candidate list returns ``[]``
        without calling the model.
    """
    if not candidate_chunks:
        return []

    # Number the chunks so the indices requested below actually refer to
    # something visible in the prompt.
    context_text = "\n\n".join(
        f"[{idx}] [{c['source']}] {c['text']}"
        for idx, c in enumerate(candidate_chunks)
    )
    prompt = f"""
You are an AI assistant. Given the query, rank the following document chunks by relevance (high to low).
Each chunk starts with its 0-based index in square brackets.
Query: {query}

Chunks:
{context_text}

Return only a JSON array of the top {top_k} chunk indices (0-based) most relevant.
"""
    response = llm.invoke(prompt)
    content = response.content
    raw = content if isinstance(content, str) else str(content)

    top_indices = _parse_rerank_indices(raw, len(candidate_chunks), top_k)
    if not top_indices:
        # Fallback: keep the vector-search order.
        top_indices = list(range(min(top_k, len(candidate_chunks))))

    return [candidate_chunks[i] for i in top_indices]

def retrieve_semantic_two_stage(query, n_results=12, threshold=1.2, top_k=6):
    """Run vector retrieval followed by LLM reranking for *query*.

    Args:
        query: Natural-language query text.
        n_results: Number of neighbours requested in the retrieval stage.
        threshold: Distance cut-off applied in the retrieval stage.
        top_k: Number of chunks requested from the rerank stage.

    Returns:
        The chunk dicts produced by ``rerank_chunks``.
    """
    candidates = retrieve_semantic(query, n_results=n_results, threshold=threshold)
    return rerank_chunks(query, candidates, top_k=top_k)

# Generate Final Answer
def generate_answer_two_stage(query):
    """Answer *query* from the knowledge base using two-stage retrieval.

    The reranked chunks are formatted as a context section (source, distance
    and text, separated by ``---`` rules) and sent to the LLM together with
    the question and citation instructions.

    Args:
        query: The user's question.

    Returns:
        The content of the model's response, or a fixed "no relevant info"
        message when retrieval yields no chunks (the model is not called).
    """
    top_chunks = retrieve_semantic_two_stage(query)
    if not top_chunks:
        return "No relevant info in the knowledge base."

    sections = [f"[{d['source']}] (dist={d['dist']})\n{d['text']}" for d in top_chunks]
    context = "\n\n---\n\n".join(sections)

    prompt = f"""
Use the context to answer the question.
Cite sources inline like [filename].
If unknown, say 'I don't know'.

Context:
{context}

Question:
{query}

Answer:
"""
    return llm.invoke(prompt).content

# Test query: runs the full pipeline (retrieval, rerank, answer generation)
# once and prints the question together with the generated answer.
q = "What is the policy regarding drug usage?"
print("Query:", q)
print("Answer:", generate_answer_two_stage(q))


Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Obtaining dependency information for requests<3.0.0,>=2.32.5 from https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl.metadata
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting boto3
  Obtaining dependency information for boto3 from https://files.pythonhosted.org/packages/14/79/012734f4e510b0a6beec2a3d5f437b3e8ef52174b1d38b1d5fdc542316d7/boto3-1.42.25-py3-none-any.whl.metadata
  Downloading boto3-1.42.25-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.43.0,>=1.42.25 (from boto3)
  Obtaining dependency information for botocore<1.43.0,>=1.42.25 from https://files.pythonhosted.org/packages/1e/b0/61e3e61d437c8c73f0821ce8a8e2594edfc1f423e354c38fa56396a4e4ca/botocore-1.42.25-py3-none-any.whl.metadata
  Downloading botocore-1.42.25-py3-none-any.whl.metadata (5.9 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (fro

  from .autonotebook import tqdm as notebook_tqdm


jury_duty_and_subpoenas_policy.txt: 9 semantic chunks
endowment_spending_policy.txt: 11 semantic chunks
policy_on_pregnancy_childbirth_lactation_and_related_conditions_faculty_and_staff1.txt: 25 semantic chunks
password_policy.txt: 43 semantic chunks
policy_on_provision_of_financial_resources_to_students.txt: 48 semantic chunks
course_level_policy.txt: 18 semantic chunks
bereavement_leave_policy.txt: 14 semantic chunks
policy_for_events_with_alcohol_on_campus.txt: 131 semantic chunks
alcohol_and_drug_policy.txt: 207 semantic chunks
policy_on_space_allocation_and_facilities_resources.txt: 95 semantic chunks
multiple_donor_gifts_policy.txt: 14 semantic chunks
general_data_privacy_regulation_notice.txt: 37 semantic chunks
policy_for_employment_of_out_of_state_residents.txt: 57 semantic chunks
non-retaliation_policy.txt: 11 semantic chunks
office_assignment_policy.txt: 42 semantic chunks
official_university_communications_policy.txt: 7 semantic chunks
placement_of_student_art_installations