# Personal Running Code

## Pipeline

### Scanning documents

In [None]:
# --- Quiet, progress-friendly PDF loader ---
import time, logging, warnings, contextlib, io
from pathlib import Path
from tqdm import tqdm

# 1) Silence pypdf chatter/warnings
try:
    from pypdf.errors import PdfReadWarning
except ImportError:
    # pypdf is not importable, or is an older version that does not expose
    # PdfReadWarning; there is nothing to filter in that case.
    pass
else:
    warnings.filterwarnings("ignore", category=PdfReadWarning)

# Raise the "pypdf" logger threshold so only ERROR and above are emitted.
logging.getLogger("pypdf").setLevel(logging.ERROR)

# 2) Load PDFs with a progress bar (no ipywidgets required)
from langchain.document_loaders import PyPDFLoader

# Root folder searched recursively for PDFs; sorted for a stable processing order.
pdf_dir = Path("data")
pdf_paths = sorted(pdf_dir.rglob("*.pdf"))

docs = []
skipped = []  # names of PDFs that failed to load, used in the final summary
start = time.perf_counter()

print(f"Scanning {len(pdf_paths)} PDFs in {pdf_dir.resolve()} ...")
for p in tqdm(pdf_paths, desc="Loading PDFs", unit="file"):
    try:
        # Some libraries print to stderr; swallow it to keep the notebook clean
        with contextlib.redirect_stderr(io.StringIO()):
            loader = PyPDFLoader(str(p))
            docs.extend(loader.load())
    except Exception as e:
        # Best-effort on purpose: one unreadable file must not abort the scan.
        # tqdm.write prints the message without garbling the active progress bar.
        skipped.append(p.name)
        tqdm.write(f":warning: Skipped {p.name}: {e}")

elapsed = time.perf_counter() - start
# Report only the files that actually loaded, not every file that was found.
loaded_count = len(pdf_paths) - len(skipped)
print(f"Done. Loaded {len(docs)} document chunks from {loaded_count} PDF files in {elapsed:,.1f}s.")
if skipped:
    print(f"Skipped {len(skipped)} PDF file(s) that could not be read.")

### Pipeline

In [None]:
# --- Split (unchanged) ---
# Re-chunk the loaded `docs` before embedding them.
from langchain.text_splitter import RecursiveCharacterTextSplitter
# chunk_size / chunk_overlap: ~1000-character chunks sharing 200 characters with their neighbour.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = splitter.split_documents(docs)
print(f"Chunks after splitting: {len(split_docs)}")

# --- Embeddings (new import path) ---
from langchain_huggingface import HuggingFaceEmbeddings
# Vectors are normalized at encode time because the index built further down
# uses inner product, which equals cosine similarity only for unit-length vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True}  # IP == cosine
)

# --- Batched FAISS builder: ensures contiguous float32 array ---
from pathlib import Path
from uuid import uuid4
from tqdm import tqdm
import numpy as np
import faiss

from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore

# Directory the FAISS index is saved to, and reloaded from when it already exists.
INDEX_DIR = Path("indexes/campus-faiss-batched")

def build_faiss_index_batched(docs, embed, batch_size=1000):
    """Embed documents in batches and wrap them in a LangChain FAISS store.

    The vectors go into a flat inner-product index. Inner product ranks like
    cosine similarity only for unit-length vectors, and this function does not
    normalize them itself, so ``embed`` is expected to return normalized vectors.

    Args:
        docs: Documents exposing ``page_content``. The same objects are stored
            in the returned store's docstore under freshly generated UUIDs.
        embed: Embedding object providing ``embed_documents(list_of_texts)``.
        batch_size: Number of texts passed to ``embed`` per call; must be >= 1.

    Returns:
        A ``FAISS`` vector store holding one vector per input document.

    Raises:
        ValueError: If ``batch_size`` is below 1, ``docs`` is empty, or the
            embedder does not return exactly one vector per text.
    """
    # Fail early with a clear message instead of an opaque range()/vstack error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    docs = list(docs)
    if not docs:
        raise ValueError("Cannot build a FAISS index from an empty document list")

    texts = [d.page_content for d in docs]

    all_embeds = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches", unit="batch"):
        # embed_documents returns one vector per text in the batch
        all_embeds.extend(embed.embed_documents(texts[i:i + batch_size]))

    # ---> the important part: stack to a contiguous float32 array for index.add
    embs = np.vstack(all_embeds).astype("float32", copy=False)
    # Explicit raise rather than assert, so the check survives `python -O`.
    if embs.ndim != 2 or embs.shape[0] != len(texts):
        raise ValueError(f"Bad shape: {embs.shape}")
    # If the embedder does not normalize, call faiss.normalize_L2(embs) here.

    index = faiss.IndexFlatIP(embs.shape[1])  # cosine == inner-product on normalized vectors
    index.add(embs)

    # wrap into LangChain's FAISS store: row i of the index <-> ids[i] <-> docs[i]
    ids = [str(uuid4()) for _ in docs]
    docstore = InMemoryDocstore(dict(zip(ids, docs)))
    id_map = dict(enumerate(ids))
    return FAISS(embedding_function=embed, index=index,
                 docstore=docstore, index_to_docstore_id=id_map)

def save_index(db, path: Path):
    """Persist ``db`` at ``path``, creating ``path``'s parent folders first.

    Args:
        db: Object exposing ``save_local(str)``.
        path: Destination handed to ``db.save_local`` as a string.
    """
    parent_dir = path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    target = str(path)
    db.save_local(target)

def load_or_build_index(split_docs, embeddings, index_dir: Path, batch_size: int = 1000):
    """Return a FAISS store, loading it from ``index_dir`` when possible.

    If ``index_dir`` exists, the saved index is loaded and returned as-is;
    ``split_docs`` is not compared against it, so delete the directory to force
    a rebuild after the documents change. If the directory is missing or
    loading fails, the index is built from ``split_docs`` and saved.

    Args:
        split_docs: Documents to embed when a build is needed.
        embeddings: Embedding object used for loading and for building.
        index_dir: Directory holding (or to hold) the saved index.
        batch_size: Texts embedded per call when building.

    Returns:
        The loaded or newly built ``FAISS`` store.
    """
    if index_dir.exists():
        try:
            print(f"Loading existing FAISS index from: {index_dir}")
            # SECURITY: this flag allows pickle deserialization of the saved
            # docstore. Only load index directories this notebook created;
            # never point index_dir at untrusted files.
            return FAISS.load_local(
                str(index_dir),
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            # Deliberate best-effort: any load failure falls through to a rebuild.
            print(f"⚠️ Failed to load existing index ({e}). Rebuilding...")
    else:
        # Printed only when nothing exists, so it never contradicts the
        # "Failed to load existing index" message above.
        print("No existing index found. Building FAISS index…")

    db = build_faiss_index_batched(split_docs, embeddings, batch_size=batch_size)
    save_index(db, index_dir)
    print(f"Saved FAISS index to: {index_dir}")
    return db

# Reuse the index saved under INDEX_DIR when present; otherwise embed split_docs and save it.
db = load_or_build_index(split_docs, embeddings, INDEX_DIR)
print("Vector store ready ✅")
print(f"Index contains {db.index.ntotal:,} vectors.")

# --- Retriever (choose one) ---
# Default: plain similarity search with k=5 passed through search_kwargs.
retriever = db.as_retriever(search_kwargs={"k": 5})
# Alternative (disabled): MMR search configured with k=8, fetch_k=50, lambda_mult=0.5.
# retriever = db.as_retriever(search_type="mmr",
#                             search_kwargs={"k": 8, "fetch_k": 50, "lambda_mult": 0.5})

# --- QA chain (HF example; swap if needed) ---
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA

# Hosted Hugging Face model used to generate answers.
# NOTE(review): presumably needs a Hugging Face API token in the environment — confirm.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-base",
    model_kwargs={"temperature": 0, "max_length": 512}
)

# The chain is wired to the `retriever` object that exists at this point.
# return_source_documents=True makes each result carry "source_documents",
# which the test-case cells read back.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("Pipeline ready ✅")


### Smaller chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split into chunks of ~1000 characters with 200-character overlap
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# `docs` is the list filled by the PDF-loading cell; the name `documents`
# is never defined in this notebook and raised NameError here.
split_docs = text_splitter.split_documents(docs)

print(f"Total chunks after splitting: {len(split_docs)}")
# Only preview when there is something to show; indexing [0] on an empty
# list would raise IndexError.
if split_docs:
    print("\n--- Preview of first chunk ---\n")
    print(split_docs[0].page_content[:500])

### Embeddings

In [None]:
# Same maintained import path the Pipeline cell uses; the
# `langchain.embeddings` location for this class is deprecated.
from langchain_huggingface import HuggingFaceEmbeddings

# No model_name is passed, so the library's default embedding model is used.
# NOTE(review): the Pipeline cell pins all-MiniLM-L6-v2 with normalized
# vectors; these embeddings may not be compatible with that saved index — confirm.
embeddings = HuggingFaceEmbeddings()

### Vectors

In [None]:
from langchain.vectorstores import FAISS

# Build a FAISS store from the current split_docs and embeddings
# (nothing is written to disk in this cell).
db = FAISS.from_documents(split_docs, embeddings)
# NOTE(review): this rebinds `db` and `retriever`, but the `qa_chain` built in
# the Pipeline cell was handed the earlier retriever object and is not updated
# by this assignment.
retriever = db.as_retriever()

## Test Cases

### UC1: Find Particular Documents

In [None]:
# UC1: ask where a specific document is and show which chunks were retrieved.
query = "Where is the CAPP final report from 2024?"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC1: Find Particular Documents ---\n")
print("Response:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Show the source path plus the first 200 characters of each chunk.
    print(f"{doc.metadata.get('source', 'unknown')} \n{doc.page_content[:200]}...\n")


### UC2: Summarize Particular Documents

In [None]:
# UC2: request a summary of a specific document and list the sources used.
query = "Summarize the CAPP final report from 2024"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC2: Summarize Particular Documents ---\n")
print("Summary:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Falls back to 'unknown' when a chunk has no 'source' metadata.
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC3: Find Documents by Contents

In [None]:
# UC3: search by topic rather than by document name and list the sources used.
query = "Find documents related to system executive policies on AI"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC3: Find Documents by Contents ---\n")
print("Response:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Falls back to 'unknown' when a chunk has no 'source' metadata.
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC4: Finding Particular Information

In [None]:
# UC4: ask for a specific fact and list the sources used.
query = "When were votes on AI policies conducted?"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC4: Finding Particular Information ---\n")
print("Response:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Falls back to 'unknown' when a chunk has no 'source' metadata.
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC5: Finding Related Information

In [None]:
# UC5: ask for related information across documents and list the sources used.
query = "Show me the history of resolutions on GE"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC5: Finding Related Information ---\n")
print("Response:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Falls back to 'unknown' when a chunk has no 'source' metadata.
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC6: Refinement of Found Information

In [None]:
# UC6: follow-up question that refers back to an earlier answer.
# NOTE(review): qa_chain is built without conversation memory, so "you showed
# me" has no earlier answer to refer to; the query is answered from retrieval
# alone — confirm this is the intended test.
query = "In the history of resolutions you showed me, which ones are supportive or opposing GE reforms?"
# Chain.__call__ is deprecated in LangChain; invoke() takes the same input
# dict and returns the same output dict.
result = qa_chain.invoke({"query": query})

print("\n--- UC6: Refinement of Found Information ---\n")
print("Response:", result["result"], "\n")

print("Documents used:")
for doc in result["source_documents"]:
    # Falls back to 'unknown' when a chunk has no 'source' metadata.
    print(f"{doc.metadata.get('source', 'unknown')}\n")
