In [44]:
from dotenv import load_dotenv
import os
import uuid
import requests
from pypdf import PdfReader
from supabase import create_client
from openai import OpenAI
from datetime import datetime
import math
from supabase import create_client

load_dotenv()

NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
SUPABASE_URL = "https://ctublgctoyuwuxwanujg.supabase.co"
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

BATCH_SIZE = 10
PDF_URL = "https://www.cdfifund.gov/system/files/documents/sample-organization-statement-of-financial-income-and-expense.pdf"

CHUNK_SIZE = 800
CHUNK_OVERLAP = 150
BATCH_SIZE = 50  # insert in batches


In [58]:
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
client = OpenAI(
  api_key=NVIDIA_API_KEY,
  base_url="https://integrate.api.nvidia.com/v1"
)


In [59]:
print("Downloading PDF...")
response = requests.get(PDF_URL)
with open("document.pdf", "wb") as f:
    f.write(response.content)


Downloading PDF...


In [49]:
print("Extracting text...")
reader = PdfReader("document.pdf")
pages = []

for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        pages.append((i + 1, text))



Extracting text...


In [50]:
def chunk_text(text, chunk_size=800, overlap=150):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

all_chunks = []

for page_number, page_text in pages:
    chunks = chunk_text(page_text, CHUNK_SIZE, CHUNK_OVERLAP)
    for chunk in chunks:
        all_chunks.append({
            "content": chunk,
            "page": page_number
        })

print(f"Total chunks: {len(all_chunks)}")


Total chunks: 9


In [54]:
def embed_batch(texts):
    print(f"Embedding batch of {len(texts)} chunks...")
    # skip if text is empty
    texts = [t for t in texts if t.strip()]
    response = client.embeddings.create(
        model="baai/bge-m3",
        input=texts,
        encoding_format="float",
        extra_body={"truncate": "NONE"}
    )
    return [d.embedding for d in response.data]


In [56]:
for i in range(0, len(all_chunks), BATCH_SIZE):

    batch = all_chunks[i:i+BATCH_SIZE]
    texts = [item["content"] for item in batch]

    embeddings = embed_batch(texts)

    rows = []
    for item, embedding in zip(batch, embeddings):
        rows.append({
            "content": item["content"],
            "embedding": embedding,
            "metadata": {
                "source": "cdfi_statement",
                "page": item["page"],
                "ingested_at": datetime.utcnow().isoformat()
            }
        })

    supabase.table("documents").insert(rows).execute()

    print(f"Inserted batch {i // BATCH_SIZE + 1}")


Embedding batch of 9 chunks...


  "ingested_at": datetime.utcnow().isoformat()


Inserted batch 1


In [60]:
# Delete the local PDF file after processing
os.remove("document.pdf")

In [62]:
# Test query
query = "What is the total revenue for the organization?"
response = client.embeddings.create(
    model="baai/bge-m3",
    input=[query],
    encoding_format="float",
    extra_body={"truncate": "NONE"}
)
query_embedding = response.data[0].embedding

supabase_response = supabase.rpc("match_documents", {
    "query_embedding": query_embedding,
    "match_count": 5
}).execute()

In [None]:
supabase_response

SingleAPIResponse(data=[{'id': 1, 'content': '     \n   \n   \n   \n   \n      \n    \n   \n  \n      \n   \n   \n   \n    \n   \n   \n           \n  \n   \n   \n    \n    \n \n \n  \n   \n       \n   \n         \n   \n  \n             \n    \n          \n     \n   \n    \n      \n    \n  \n    \n Sample Organization \nStatement of Financial Income and Expense \nAccrual Basis January through December 2018 \nJan - Dec 18 Budget $ Over Budget % of Budget \nIncome \n40000 · Raised Income \n40500 · Individual Contributions 205,182.50 169,775.76 35,406.74 120.9% \n41000 · Corporate Contributions \n41100 · Corporate Grants 248,131.70 173,360.34 74,771.36 143.1% \n41200 · Corporate Sponsorships 22,839.88 17,739.88 5,100.00 128.7% \n41300 · Corporate Match 7,192.25 5,837.27 1,354.98 123.2% \nTotal 41000 · Corporate Contributions 278,163.83 196,937.49 81,226.34 141.2% \n42000 · Foundation Contri', 'metadata': {'page': 1, 'source': 'cdfi_statement', 'ingested_at': '2026-02-17T23:05:07.894609'}, 