# In [None]:
# pip install "langchain>=0.2" langchain-community langchain-openai langchain-text-splitters pgvector "psycopg[binary]" pdfplumber

import os
from pathlib import Path

from langchain_community.document_loaders import PDFPlumberLoader
from langchain_community.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- config ---
# Input document; only its file name (not the directory part) is kept as the
# human-readable document label.
pdf_path = "path/to/your.pdf"
doc_name = Path(pdf_path).name

# Name of the vector-store collection the chunks are written to.
COLLECTION = "rag_with_pdfplumber"

# Database URL: taken from PG_CONN when set, otherwise a local placeholder.
CONNECTION_STRING = os.environ.get(
    "PG_CONN",
    "postgresql+psycopg://user:pass@localhost:5432/yourdb",
)

# Leave an already-exported key untouched; otherwise seed the placeholder so
# the variable always exists in the process environment.
# NOTE(review): the placeholder is not a usable key — presumably it must be
# replaced before any OpenAI call succeeds.
os.environ.setdefault("OPENAI_API_KEY", "<your-openai-key>")

# --- 1) load with pdfplumber ---
loader = PDFPlumberLoader(pdf_path)
pages = loader.load()  # one Document per page

# Normalize metadata: keep everything the loader produced, then stamp each
# page with the document's file name and make sure a "page" key is present.
for d in pages:
    # Copy once behind a single guard, so missing/empty metadata is handled
    # the same way for every key below (the copy also avoids mutating a dict
    # the loader might share between pages).
    meta = dict(d.metadata or {})
    meta["document_name"] = doc_name
    meta["source"] = doc_name  # common field used by many UIs; replaces any loader-provided value
    meta.setdefault("page", None)  # keep the loader's page value; None only if it is absent
    d.metadata = meta

# --- 2) chunk ---
# Chunking parameters, named so they can be tuned in one place.
# NOTE(review): presumably measured in characters (the splitter's default
# length function) — confirm if a token-based length function is ever passed.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Split every page Document into smaller Documents ready for embedding.
chunks = splitter.split_documents(pages)

# --- 3) embed + store in Postgres/pgvector ---
# text-embedding-3-small is chosen as the cost-friendly embedding model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Where the vectors go: collection name plus database URL from the config.
store_options = {
    "collection_name": COLLECTION,
    "connection_string": CONNECTION_STRING,
}

# Create the collection/table if needed and insert the chunk documents.
# NOTE(review): presumably re-running this cell appends the same chunks to the
# collection again — confirm before ingesting the same PDF twice.
vs = PGVector.from_documents(chunks, embeddings, **store_options)

print(f"Ingested {len(chunks)} chunks from {doc_name} into '{COLLECTION}'")

# --- 4) retriever ---
# Similarity search over the stored vectors, asking for the top 5 matches.
retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# --- 5) test query ---
query = "Summarize the document's data retention policy."
# Retrievers are Runnables: invoke() is the supported entry point and replaces
# the deprecated get_relevant_documents().
hits = retriever.invoke(query)
for i, h in enumerate(hits, 1):
    print(f"\n=== RESULT {i} ===")
    print(h.page_content[:400], "...")  # preview only: first 400 characters
    print("metadata:", h.metadata)  # includes {"document_name": ..., "page": ..., "source": ...}

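# (Optional) LLM QA. RetrievalQA is a legacy LangChain chain; call it with
# .invoke() (Chain.run is deprecated) and read the answer from the "result" key:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
# llm = ChatOpenAI(model="gpt-4o-mini")
# qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")
# print(qa.invoke({"query": query})["result"])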
