#### Reading PDFs from IRS site, converting to md, chunking & storing vectors

In [None]:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from io import BytesIO
import fitz  # PyMuPDF
import os

IRS_URL = "https://www.irs.gov/instructions"

try:
    response = requests.get(IRS_URL, timeout=10)
    response.raise_for_status()  # Raises an error for non-200 responses
except requests.RequestException as e:
    raise Exception(f"Failed to fetch IRS instructions page: {e}")

soup = BeautifulSoup(response.text, "html.parser")
pdf_texts = []

for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.lower().endswith(".pdf"):
        pdf_url = urljoin(IRS_URL, href)
        print(f"Processing PDF: {pdf_url}")

        try:
            pdf_response = requests.get(pdf_url, timeout=10)
            pdf_response.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ Failed to fetch {pdf_url}: {e}")
            continue

        pdf_bytes = BytesIO(pdf_response.content)
        try:
            # Using PyMuPDF (fitz) to extract text as an alternative to pdfminer
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            text = ""
            for page in doc:
                text += page.get_text()
            pdf_texts.append({"file_name": pdf_url.split("/")[-1], "content": text})
        except Exception as e:
            print(f"❌ Failed to process {pdf_url}: {e}")

print(f"✅ Processed {len(pdf_texts)} PDFs")


Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i56.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i172.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i461.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706a.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706d.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd1.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gst.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706na.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706qdt.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i709.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i720.pdf
Pro

In [24]:
# TODO 1: Make this work with the user inputted PDF

from markdownify import markdownify as md

# Convert extracted text to Markdown (In-Memory)
pdf_markdown = [
    {"file_name": doc["file_name"].replace(".pdf", ".md"), "content": md(doc["content"])}
    for doc in pdf_texts
]

print(f"✅ Converted {len(pdf_markdown)} PDFs to Markdown format")

✅ Converted 409 PDFs to Markdown format


In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Chunk the Markdown documents
chunks = []
for doc in pdf_markdown:
    split_texts = text_splitter.split_text(doc["content"])
    for i, text in enumerate(split_texts):
        chunks.append({"file_name": doc["file_name"], "chunk_id": i, "content": text})

print(f"✅ Created {len(chunks)} text chunks")

✅ Created 68418 text chunks


In [26]:
import pinecone

# Initialize Pinecone
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Set index name
index_name = "tax-rag"

# Check if the index already exists
existing_indexes = [index.name for index in pc.list_indexes()]
if index_name not in existing_indexes:
    print(f"Creating new Pinecone index: {index_name}")
    pc.create_index(
        name=index_name,
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        dimension=384,
        metric="cosine"
    )

# Connect to the existing index
index = pc.Index(index_name)
print(f"✅ Connected to Pinecone index: {index_name}")

Creating new Pinecone index: tax-rag
✅ Connected to Pinecone index: tax-rag


In [None]:
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert each chunk into an embedding
chunk_embeddings = [
    {
        "file_name": chunk["file_name"],
        "chunk_id": chunk["chunk_id"],
        "content": chunk["content"],
        "embedding": embedding_model.encode(chunk["content"]).tolist()
    }
    for chunk in chunks
]

# Store in Pinecone with batching
def batch_upsert(index, vectors, batch_size=100):
    """Uploads embeddings in batches to avoid Pinecone's request size limit."""
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i : i + batch_size]  # Get batch slice
        index.upsert(batch)
        print(f"✅ Uploaded batch {i // batch_size + 1}/{(len(vectors) // batch_size) + 1}")

# Prepare vectors for Pinecone
vectors = [
    (f"{chunk['file_name']}_{chunk['chunk_id']}", chunk["embedding"], {"text": chunk["content"]})
    for chunk in chunk_embeddings
]

# Upload in batches
batch_upsert(index, vectors)

print("✅ All embeddings successfully stored in Pinecone!")

The cell executed successfully, but we accidentally deleted the output during the cleanup process. The complete successful run took two hours.

In [None]:
# Test Query

query = "Which forms do I need for self-employment?"
query_embedding = embedding_model.encode(query).tolist()

results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print top results
for match in results["matches"]:
    print(f"🔹 Score: {match['score']}")
    print(f"📄 Text: {match['metadata']['text']}\n")

🔹 Score: 0.604735732
📄 Text: self-employment income from separate nonfarm or farm
businesses, each of you must complete and file a
separate Schedule C (Form 1040) or Schedule F (Form
1040). Be sure to enter at the top of each Schedule C
(Form 1040) or Schedule F (Form 1040) the name and
SSN of the spouse who owns the business. Each of you
must also complete a separate Schedule SE (Form 1040).
Attach these pages to a single Form 1040-SS.
Business Owned and Operated by
Spouses

🔹 Score: 0.598732114
📄 Text: Schedule SE (Form 1040), Self-Employment Tax, to complete your return.
You may only need to file Form 1040-SS and none of the schedules. However, if your return is more complicated (for
example, you claim certain deductions or credits or owe additional taxes), you will need to complete one or more of the
schedules. Below is a general guide to which schedule(s) you will need to file based on your circumstances. See the

🔹 Score: 0.588199079
📄 Text: General Instructions
Purpose of Form
T