# Personal Running Code

## Pipeline

### Scanning documents

In [None]:
# --- PDF Load ---
# Recursively finds every *.pdf under ./data, loads each one with PyPDFLoader,
# and accumulates the resulting documents in the notebook-level list `docs`.
import time, logging, warnings, contextlib, io
from pathlib import Path
from tqdm import tqdm

# 1) Silence pypdf's read warnings so they do not flood the notebook output.
try:
    from pypdf.errors import PdfReadWarning
except ImportError:
    # pypdf (or this warning class) is unavailable: nothing to filter.
    # Only ImportError is tolerated so unrelated failures are not hidden.
    pass
else:
    warnings.filterwarnings("ignore", category=PdfReadWarning)

logging.getLogger("pypdf").setLevel(logging.ERROR)

# 2) Load every PDF with a progress bar.
# The loader is imported from langchain_community (the package this notebook
# already uses for Chroma) rather than the deprecated langchain.document_loaders.
from langchain_community.document_loaders import PyPDFLoader

pdf_dir = Path("data")
pdf_paths = sorted(pdf_dir.rglob("*.pdf"))

docs = []
start = time.perf_counter()

print(f"Scanning {len(pdf_paths)} PDFs in {pdf_dir.resolve()} ...")
for p in tqdm(pdf_paths, desc="Loading PDFs", unit="file"):
    try:
        # Swallow anything the loader writes to stderr to keep the cell output clean.
        with contextlib.redirect_stderr(io.StringIO()):
            loader = PyPDFLoader(str(p))
            docs.extend(loader.load())
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole scan.
        print(f":warning: Skipped {p.name}: {e}")

elapsed = time.perf_counter() - start
print(f"Done. Loaded {len(docs)} document chunks from {len(pdf_paths)} PDF files in {elapsed:,.1f}s.")

### Building the retrieval pipeline

In [None]:
# --- Split ---
# Cut the loaded documents into overlapping character windows for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = splitter.split_documents(docs)
print(f"Split Chunks: {len(split_docs)}")

# --- Embeddings ---
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True}
)

# --- Chroma vector store ---
from langchain_community.vectorstores import Chroma

# In-memory index built from split_docs (no persist directory is given).
db = Chroma.from_documents(split_docs, embeddings)   # RAM only

print("Chroma Vector Store: Ready")


# --- Retriever ---
# Built exactly once; the QA chain below and the helper cells share it.
retriever = db.as_retriever(search_kwargs={"k": 5})

# --- QA ---
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import RetrievalQA

# NOTE(review): flan-t5 is a seq2seq (text2text) model -- confirm the endpoint
# serves it correctly with HuggingFaceEndpoint's default task.
llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-base",
    temperature=0,
    max_new_tokens=512,
)

# "stuff" places all retrieved chunks into a single prompt; source documents
# are returned so the helper cells can print where an answer came from.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)
print("Pipeline: Ready")

## Test Cases

### Question Helper

In [None]:
# Detect input and call qa_chain
def ask(question: str):
    """Run ``question`` through the global ``qa_chain`` and return its output.

    The chain's input key differs between chain types, so the keys the chain
    advertises via ``input_keys`` are tried first, followed by the common
    fallbacks ``"query"`` and ``"input"``.

    Args:
        question: The natural-language question to send to the chain.

    Returns:
        Whatever ``qa_chain.invoke`` returns for the first key that succeeds.

    Raises:
        RuntimeError: If every candidate key fails; the last underlying
            exception is attached as ``__cause__``.
    """
    want = list(getattr(qa_chain, "input_keys", []))  # e.g. ['query'] or ['input']
    # De-duplicate while preserving order: without this, a chain advertising
    # ['query'] would be invoked twice with the same payload on a real failure.
    candidates = list(dict.fromkeys([*want, "query", "input"]))
    last_err = None
    for key in candidates:
        try:
            return qa_chain.invoke({key: question})
        except Exception as e:
            # Deliberately broad: a wrong input key surfaces as different
            # exception types depending on the chain, so remember and move on.
            last_err = e
    raise RuntimeError(f"Unable to invoke qa_chain. input_keys={want}") from last_err


def print_sources(result, preview=140, max_items=8):
    """Print one line per distinct source document found in a QA result.

    Args:
        result: Mapping returned by the QA chain; its ``"source_documents"``
            entry (if present) is an iterable of objects exposing ``metadata``
            (a dict) and ``page_content`` (a string).
        preview: Maximum number of characters of page content to show.
        max_items: Maximum number of distinct sources to print.

    Duplicate sources are skipped and do not count against ``max_items``, so a
    run of chunks from the same file cannot hide the remaining sources.
    """
    print("\nSources:")
    seen = set()
    for d in result.get("source_documents", []):
        # Limit applies to printed (unique) sources, not to raw retrieved chunks.
        if len(seen) >= max_items:
            break
        src = d.metadata.get("source", "unknown")
        if src in seen:
            continue
        seen.add(src)
        page = d.metadata.get("page", None)
        # Page is shown 1-based; omitted when the metadata has no integer page.
        page_str = f" (p.{page+1})" if isinstance(page, int) else ""
        snippet = d.page_content.replace("\n", " ").strip()
        print(f"- {src}{page_str} :: {snippet[:preview]}...")
        

# Retrieve only
def list_top_docs(query: str, k: int = 10, preview=120):
    """Print the best-matching source files for ``query`` without calling the LLM.

    Args:
        query: Search text.
        k: Number of chunks to fetch from the vector store.
        preview: Maximum number of characters of each chunk to show.

    The global ``retriever`` is configured with a fixed result count, so slicing
    its output could never return more than that many hits. The underlying
    vector store is therefore queried directly so ``k`` is honoured.
    Chunks sharing a source path are collapsed to the first (best-ranked) one.
    """
    hits = retriever.vectorstore.similarity_search(query, k=k)
    print(f"Top {len(hits)} matches for: {query}\n")
    paths_seen = set()
    for d in hits:
        src = d.metadata.get("source", "unknown")
        if src in paths_seen:
            continue
        paths_seen.add(src)
        rank = len(paths_seen)  # rank counts unique sources only
        page = d.metadata.get("page", None)
        # Page is shown 1-based; omitted when the metadata has no integer page.
        page_str = f" (p.{page+1})" if isinstance(page, int) else ""
        snippet = d.page_content.replace("\n", " ").strip()
        print(f"{rank:>2}. {src}{page_str}\n    {snippet[:preview]}...\n")


### UC1: Find Particular Documents

In [None]:
# UC1: locate a specific document -- show the ranked retrieval hits, then ask the QA chain.
print("UC1: Find Particular Documents")
q = "Where is the CAPP final report from 2024?"

# Retrieval-only listing of matching sources for the question.
list_top_docs(q, k=10, preview=150)

# Full QA pass; the answer text is read from the "result" key of the chain output.
res = ask(q)
print("\nResponse:\n", res["result"])
print_sources(res)

### UC2: Summarize Particular Documents

In [None]:
# UC2: ask the QA chain for a summary of one named document.
print("UC2: Summarize Particular Documents")
q = "Summarize the CAPP final report from 2024."

# The answer text is read from the "result" key; sources are printed afterwards.
res = ask(q)
print("\nSummary:\n", res["result"])
print_sources(res)


### UC3: Find Documents by Contents

In [None]:
# UC3: topic search -- show the ranked retrieval hits, then ask the QA chain.
print("UC3: Find Documents by Contents")
q = "Find documents related to system executive policies on AI."

# Retrieval-only listing of matching sources for the question.
list_top_docs(q, k=12, preview=150)

# Full QA pass; the answer text is read from the "result" key of the chain output.
res = ask(q)
print("\nResponse:\n", res["result"])
print_sources(res)


### UC4: Finding Particular Information

In [None]:
# UC4: fact lookup -- ask for specific dates and where they appear.
print("UC4: Finding Particular Information")
q = "When were votes on AI policies conducted? Provide dates and where they appear."

res = ask(q)

# The answer text is read from the "result" key; sources are printed afterwards.
print("\nResponse:\n", res["result"])
print_sources(res)


### UC5: Finding Related Information

In [None]:
# UC5: history query -- ask the QA chain for a dated history of related items.
print("UC5: Finding Related Information (history)")
q = "Show me the history of resolutions on GE. Include dates, brief summaries, and where they were recorded."

# The answer text is read from the "result" key; sources are printed afterwards.
res = ask(q)
print("\nResponse:\n", res["result"])
print_sources(res)


### UC6: Refinement of Found Information

In [None]:
# UC6: follow-up query that asks the chain to regroup a previously produced list.
# NOTE(review): ask() sends only this question string to the chain, so the phrase
# "you listed" has no earlier answer to refer to unless the chain keeps
# conversation state -- confirm, or include the earlier answer in the question.
print("UC6: Refinement of Found Information")
q = "In the history of GE resolutions you listed, which items support vs oppose GE reforms? Group them."

# The answer text is read from the "result" key; sources are printed afterwards.
res = ask(q)
print("\nResponse:\n", res["result"])
print_sources(res)
