# Personal Running Code

## Pipeline

### Scanning documents

In [None]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

# Collect every file under data/ that matches the PDF glob pattern and
# parse each match with PyPDFLoader.
loader = DirectoryLoader(
    "data/",
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

### Checking First Document

In [None]:
# Report how many documents were loaded and preview the first 500 characters
# of the first one.
print(f"Total loaded documents: {len(documents)}")
if documents:
    print("\n--- Preview of first document ---\n")
    print(documents[0].page_content[:500])
else:
    # Indexing documents[0] would raise IndexError when the loader found nothing.
    print("No documents were loaded; check that data/ contains PDF files.")

### Smaller chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Split into chunks of ~1000 characters with 200-character overlap.
# CharacterTextSplitter breaks only on a single separator (a blank line by
# default), so text without blank lines is kept as one oversized chunk.
# RecursiveCharacterTextSplitter falls back to progressively finer separators,
# which keeps chunks close to the requested chunk_size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)

print(f"Total chunks after splitting: {len(split_docs)}")
if split_docs:
    print("\n--- Preview of first chunk ---\n")
    print(split_docs[0].page_content[:500])
else:
    # Indexing split_docs[0] would raise IndexError when nothing was split.
    print("No chunks were produced; nothing to preview.")

### Embeddings

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Build the embedding model with the library defaults (no arguments are passed).
# NOTE(review): the default model depends on the installed LangChain version —
# pass model_name explicitly if results must be reproducible.
embeddings = HuggingFaceEmbeddings()

### Vectors

In [None]:
from langchain.vectorstores import FAISS

# Embed the split chunks into a FAISS index, then expose it as a retriever.
db = FAISS.from_documents(
    documents=split_docs,
    embedding=embeddings,
)
retriever = db.as_retriever()

## Test Cases

### UC1: Find Particular Documents

In [None]:
# UC1: ask the QA chain where a specific document is.
# NOTE(review): qa_chain is not defined in the cells shown here — confirm it is
# built (and returns "result" and "source_documents") before this cell runs.
query = "Where is the CAPP final report from 2024?"
result = qa_chain({"query": query})

print("\n--- UC1: Find Particular Documents ---\n")
print("Response:", result["result"], "\n")

# For each returned source document, show its "source" metadata (or "unknown"
# when absent) followed by the first 200 characters of its content.
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')} \n{doc.page_content[:200]}...\n")


### UC2: Summarize Particular Documents

In [None]:
# UC2: ask the QA chain to summarize a specific document.
query = "Summarize the CAPP final report from 2024"
result = qa_chain({"query": query})

print("\n--- UC2: Summarize Particular Documents ---\n")
print("Summary:", result["result"], "\n")

# List the "source" metadata of each returned source document ("unknown" when absent).
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC3: Find Documents by Contents

In [None]:
# UC3: ask the QA chain for documents about a topic rather than a named file.
query = "Find documents related to system executive policies on AI"
result = qa_chain({"query": query})

print("\n--- UC3: Find Documents by Contents ---\n")
print("Response:", result["result"], "\n")

# List the "source" metadata of each returned source document ("unknown" when absent).
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC4: Finding Particular Information

In [None]:
# UC4: ask the QA chain a factual question whose answer lies inside the documents.
query = "When were votes on AI policies conducted?"
result = qa_chain({"query": query})

print("\n--- UC4: Finding Particular Information ---\n")
print("Response:", result["result"], "\n")

# List the "source" metadata of each returned source document ("unknown" when absent).
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC5: Finding Related Information

In [None]:
# UC5: ask the QA chain for a history that may span several documents.
query = "Show me the history of resolutions on GE"
result = qa_chain({"query": query})

print("\n--- UC5: Finding Related Information ---\n")
print("Response:", result["result"], "\n")

# List the "source" metadata of each returned source document ("unknown" when absent).
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')}\n")


### UC6: Refinement of Found Information

In [None]:
# UC6: follow-up question that refines an earlier answer.
# NOTE(review): the query refers to a previous answer ("you showed me"), but
# only this query string is passed to qa_chain in this cell — confirm the chain
# keeps conversation history, otherwise it has no access to that earlier output.
query = "In the history of resolutions you showed me, which ones are supportive or opposing GE reforms?"
result = qa_chain({"query": query})

print("\n--- UC6: Refinement of Found Information ---\n")
print("Response:", result["result"], "\n")

# List the "source" metadata of each returned source document ("unknown" when absent).
print("Documents used:")
for doc in result["source_documents"]:
    print(f"{doc.metadata.get('source', 'unknown')}\n")
