In [None]:
# Manually setting keys while env is acting up - REMOVE BEFORE PUSH 
import os

#### Reading PDFs from IRS site, converting to md, chunking & storing vectors

In [17]:
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from io import BytesIO
from urllib.parse import urljoin

# Defining IRS base URL
IRS_URL = "https://www.irs.gov/instructions"

# Fetching IRS instructions page
response = requests.get(IRS_URL)
if response.status_code != 200:
    raise Exception(f"Failed to fetch IRS instructions page: {response.status_code}")

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links
pdf_texts = []
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.lower().endswith(".pdf"):  # Ensure it's a PDF link
        pdf_url = urljoin(IRS_URL, href)
        print(f"Processing PDF: {pdf_url}")

        # Streaming PDF instead of downloading
        pdf_response = requests.get(pdf_url, stream=True)
        if pdf_response.status_code == 200:
            pdf_bytes = BytesIO(pdf_response.content)  # Converting to a file-like object
            text = extract_text(pdf_bytes)  # Extracting text directly
            pdf_texts.append({"file_name": pdf_url.split("/")[-1], "content": text})

print(f"âœ… Processed {len(pdf_texts)} PDFs without storing them locally")

Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i56.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i172.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i461.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706a.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706d.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd1.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gst.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706na.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706qdt.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i709.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i720.pdf
Pro

PSSyntaxError: Invalid dictionary construct: [/'BG2', /'Default', /'OP', /b'fa', /b'lse', /'OPM', 1, /'SA', False, /'SM', 0.02, /'Type', /'ExtGState', /'UCR2', /'Default', /'op', False]

In [None]:
# TODO 1: Make this work with the user inputted PDF

from markdownify import markdownify as md

# Convert extracted text to Markdown (In-Memory)
pdf_markdown = [
    {"file_name": doc["file_name"].replace(".pdf", ".md"), "content": md(doc["content"])}
    for doc in pdf_texts
]

print(f"âœ… Converted {len(pdf_markdown)} PDFs to Markdown format")

âœ… Converted 408 PDFs to Markdown format


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Chunk the Markdown documents
chunks = []
for doc in pdf_markdown:
    split_texts = text_splitter.split_text(doc["content"])
    for i, text in enumerate(split_texts):
        chunks.append({"file_name": doc["file_name"], "chunk_id": i, "content": text})

print(f"âœ… Created {len(chunks)} text chunks")

âœ… Created 70922 text chunks


In [None]:
import pinecone

# Initialize Pinecone
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Set index name
index_name = "tax-rag"

# Check if the index already exists
existing_indexes = [index.name for index in pc.list_indexes()]
if index_name not in existing_indexes:
    print(f"Creating new Pinecone index: {index_name}")
    pc.create_index(
        name=index_name,
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        dimension=384,  # Make sure this matches your embeddings' dimension
        metric="cosine"
    )

# Connect to the existing index
index = pc.Index(index_name)
print(f"âœ… Connected to Pinecone index: {index_name}")

  from tqdm.autonotebook import tqdm


âœ… Connected to Pinecone index: tax-rag


In [None]:
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert each chunk into an embedding
chunk_embeddings = [
    {
        "file_name": chunk["file_name"],
        "chunk_id": chunk["chunk_id"],
        "content": chunk["content"],
        "embedding": embedding_model.encode(chunk["content"]).tolist()  # Ensure it's a list, not a NumPy array
    }
    for chunk in chunks
]

# Store in Pinecone with batching
def batch_upsert(index, vectors, batch_size=100):
    """Uploads embeddings in batches to avoid Pinecone's request size limit."""
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i : i + batch_size]  # Get batch slice
        index.upsert(batch)
        print(f"âœ… Uploaded batch {i // batch_size + 1}/{(len(vectors) // batch_size) + 1}")

# Prepare vectors for Pinecone
vectors = [
    (f"{chunk['file_name']}_{chunk['chunk_id']}", chunk["embedding"], {"text": chunk["content"]})
    for chunk in chunk_embeddings
]

# Upload in batches
batch_upsert(index, vectors)

print("âœ… All embeddings successfully stored in Pinecone!")



âœ… Uploaded batch 1/710
âœ… Uploaded batch 2/710
âœ… Uploaded batch 3/710
âœ… Uploaded batch 4/710
âœ… Uploaded batch 5/710
âœ… Uploaded batch 6/710
âœ… Uploaded batch 7/710
âœ… Uploaded batch 8/710
âœ… Uploaded batch 9/710
âœ… Uploaded batch 10/710
âœ… Uploaded batch 11/710
âœ… Uploaded batch 12/710
âœ… Uploaded batch 13/710
âœ… Uploaded batch 14/710
âœ… Uploaded batch 15/710
âœ… Uploaded batch 16/710
âœ… Uploaded batch 17/710
âœ… Uploaded batch 18/710
âœ… Uploaded batch 19/710
âœ… Uploaded batch 20/710
âœ… Uploaded batch 21/710
âœ… Uploaded batch 22/710
âœ… Uploaded batch 23/710
âœ… Uploaded batch 24/710
âœ… Uploaded batch 25/710
âœ… Uploaded batch 26/710
âœ… Uploaded batch 27/710
âœ… Uploaded batch 28/710
âœ… Uploaded batch 29/710
âœ… Uploaded batch 30/710
âœ… Uploaded batch 31/710
âœ… Uploaded batch 32/710
âœ… Uploaded batch 33/710
âœ… Uploaded batch 34/710
âœ… Uploaded batch 35/710
âœ… Uploaded batch 36/710
âœ… Uploaded batch 37/710
âœ… Uploaded batch 38/710
âœ… Uploaded batch 39

In [None]:
query = "Which forms do I need for self-employment?"
query_embedding = embedding_model.encode(query).tolist()

results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print top results
for match in results["matches"]:
    print(f"ðŸ”¹ Score: {match['score']}")
    print(f"ðŸ“„ Text: {match['metadata']['text']}\n")

ðŸ”¹ Score: 0.617472708
ðŸ“„ Text: your share of the applicable income, deduction, or loss. Each of you must also file a separate Schedule SE (Form 1040), Self-Employment Tax, to pay self-employment tax, as applicable.

ðŸ”¹ Score: 0.598380625
ðŸ“„ Text: Schedule SE (Form 1040), Self-Employment Tax, to complete your return.
You may only need to file Form 1040-SS and none of the schedules. However, if your return is more complicated (for
example, you claim certain deductions or credits or owe additional taxes), you will need to complete one or more of the
schedules. Below is a general guide to which schedule(s) you will need to file based on your circumstances. See the

ðŸ”¹ Score: 0.586371124
ðŸ“„ Text: self-employment under the farm optional method on
Schedule SE (Form 1040), Part II.
Code C. Gross nonfarm income. If you're an individual
partner, use this amount to figure net earnings from
self-employment under the nonfarm optional method on
Schedule SE (Form 1040), Part II.
Box 15. C