In [6]:
# Manually setting keys while env is acting up - REMOVE BEFORE PUSH 
import os

os.environ["OPENAI_API_KEY"] = "HOLDER"
os.environ["PINECONE_API_KEY"] = "HOLDER"

#### Part 1: Testing RAG framework on 6 PDFs 

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to repo 
REPO_PATH = "/Users/halladaykinsey/AAI540_ML" 

def load_markdown_files(directory):
    """Load all Markdown files from a given directory and return their content as a list of documents."""
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".md"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                documents.append({"file_name": filename, "content": file.read()})
    return documents

docs = load_markdown_files(REPO_PATH)
print(f"Loaded {len(docs)} markdown files.")
print(f"Sample document: {docs[0] if docs else 'No documents found!'}")


Loaded 7 markdown files.
Sample document: {'file_name': 'w9.md', 'content': '# Instructions for the Internal Revenue Service\n\n# Requester of Form W-9 (Rev. March 2024)\n\n# Request for Taxpayer Identification Number and Certification\n\nSection references are to the Internal Revenue Code unless otherwise noted.\n\n# Future Developments\n\nFor the latest developments related to Form W-9 and its instructions, such as legislation enacted after they were published, go to IRS.gov/FormW9.\n\n# What’s New\n\nLine 3a. We clarified that a Limited Liability Company (LLC) that is a disregarded entity should fill out line 3a by checking the appropriate box for the tax classification of its owner in the first row on line 3a. We also added guidance that provides clarity for disregarded entities completing lines 1 and 2. For proper processing, information for disregarded entities is reported as the owner’s name on line 1, and the disregarded entity’s name is entered on line 2.\n\nFor an LLC that is

In [3]:
# Splitting text into smaller chunks
def chunk_text(documents, chunk_size=500, overlap=100):
    """Splits documents into chunks with overlap."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunked_docs = []
    
    for doc in documents:
        chunks = text_splitter.split_text(doc["content"])
        for i, chunk in enumerate(chunks):
            chunked_docs.append({
                "file_name": doc["file_name"],
                "chunk_id": i,
                "content": chunk
            })
    return chunked_docs

# Processing markdown files into chunks
chunks = chunk_text(docs)

# Printing sample chunk
print(f"Sample Chunk: {chunks[0]}")

Sample Chunk: {'file_name': 'w9.md', 'chunk_id': 0, 'content': '# Instructions for the Internal Revenue Service\n\n# Requester of Form W-9 (Rev. March 2024)\n\n# Request for Taxpayer Identification Number and Certification\n\nSection references are to the Internal Revenue Code unless otherwise noted.\n\n# Future Developments\n\nFor the latest developments related to Form W-9 and its instructions, such as legislation enacted after they were published, go to IRS.gov/FormW9.\n\n# What’s New'}


In [4]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

chunk_embeddings = [
    {
        "file_name": chunk["file_name"],
        "chunk_id": chunk["chunk_id"],
        "content": chunk["content"], 
        "embedding": embedding_model.encode(chunk["content"])
    }
    for chunk in chunks
]

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pinecone

# Initializing Pinecone 
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Setting index name
index_name = "tax-rag"

if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name, 
        spec=pinecone.ServerlessSpec(
            cloud="aws",  
            region="us-east-1"
        ), 
        dimension=384, 
        metric="cosine"
    )

# Connecting to index 
index = pc.Index(index_name)

print(f"Pinecone index '{index_name}' is ready!")

Pinecone index 'tax-rag' is ready!


In [6]:
# Preparing vectors for Pinecone
vectors = [
    (f"{chunk['file_name']}_{chunk['chunk_id']}", chunk["embedding"], {"text": chunk["content"]})
    for chunk in chunk_embeddings
]

In [7]:
# Function to batch and upload embeddings
def batch_upsert(index, vectors, batch_size=100):
    """Upserts embeddings in smaller batches to avoid Pinecone's request size limit."""
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i : i + batch_size]  # Get batch slice
        index.upsert(batch)  # Upload batch
        print(f"Uploaded batch {i // batch_size + 1}/{(len(vectors) // batch_size) + 1}")

# Uploading embeddings in batches
batch_upsert(index, vectors)

print("✅ All embeddings successfully stored in Pinecone!")

Uploaded batch 1/38
Uploaded batch 2/38
Uploaded batch 3/38
Uploaded batch 4/38
Uploaded batch 5/38
Uploaded batch 6/38
Uploaded batch 7/38
Uploaded batch 8/38
Uploaded batch 9/38
Uploaded batch 10/38
Uploaded batch 11/38
Uploaded batch 12/38
Uploaded batch 13/38
Uploaded batch 14/38
Uploaded batch 15/38
Uploaded batch 16/38
Uploaded batch 17/38
Uploaded batch 18/38
Uploaded batch 19/38
Uploaded batch 20/38
Uploaded batch 21/38
Uploaded batch 22/38
Uploaded batch 23/38
Uploaded batch 24/38
Uploaded batch 25/38
Uploaded batch 26/38
Uploaded batch 27/38
Uploaded batch 28/38
Uploaded batch 29/38
Uploaded batch 30/38
Uploaded batch 31/38
Uploaded batch 32/38
Uploaded batch 33/38
Uploaded batch 34/38
Uploaded batch 35/38
Uploaded batch 36/38
Uploaded batch 37/38
Uploaded batch 38/38
✅ All embeddings successfully stored in Pinecone!


In [8]:
query = "What forms do I need to file as a self-employed individual?"
query_embedding = embedding_model.encode(query).tolist()

In [9]:
# Search Pinecone for the most relevant chunks
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print top results
print("🔍 Top matching tax instructions:")
for match in results["matches"]:
    print(f"🔹 Score: {match['score']}")
    print(f"📄 Text: {match['metadata']['text']}\n")

🔍 Top matching tax instructions:
🔹 Score: 0.694014966
📄 Text: your share of the applicable income, deduction, or loss. Each of you must also file a separate Schedule SE (Form 1040), Self-Employment Tax, to pay self-employment tax, as applicable.

🔹 Score: 0.670240104
📄 Text: For more information on e-filing, see E-file for Business and Self-employed Taxpayers on IRS.gov.

# Waivers

🔹 Score: 0.609281301
📄 Text: |9. Enter the earned income you (and your spouse if filing jointly) received as a self-employed individual or a partner. Generally, this is your (and your spouse's if filing jointly) net earnings from self-employment if your personal services were a material income-producing factor, minus any deductions on Schedule 1, lines 15 and 16. If zero or less, enter -0-. For more details, see Pub. 590-A.| |
|10. Add lines 8 and 9.| |
|!|!|

🔹 Score: 0.562972605
📄 Text: You can order forms, instructions, and publications at IRS.gov/OrderForms. For any other tax information, go to IRS.gov/

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Initializing OpenAI GPT model for testing
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)

# Converting retrieved text chunks into context
retrieved_texts = [match["metadata"]["text"] for match in results["matches"]]
context = "\n".join(retrieved_texts)

# Calling GPT-4 to respond using the retrieved context
response = llm.predict(f"Based on the following tax instructions, answer this question:\n\n{context}\n\nQuestion: {query}")

print("🤖 AI Response:")
print(response)

  llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)
  response = llm.predict(f"Based on the following tax instructions, answer this question:\n\n{context}\n\nQuestion: {query}")


🤖 AI Response:
As a self-employed individual, you need to file a Schedule SE (Form 1040) for Self-Employment Tax and possibly Form 1065 if you are part of a domestic partnership. If you have one or more employees, you also need to file Form(s) W-2.


#### Part 2: Reading PDFs from IRS site, converting to md, chunking & storing vectors

In [1]:
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from io import BytesIO
from urllib.parse import urljoin

# Defining IRS base URL
IRS_URL = "https://www.irs.gov/instructions"

# Fetching IRS instructions page
response = requests.get(IRS_URL)
if response.status_code != 200:
    raise Exception(f"Failed to fetch IRS instructions page: {response.status_code}")

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links
pdf_texts = []
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.lower().endswith(".pdf"):  # Ensure it's a PDF link
        pdf_url = urljoin(IRS_URL, href)
        print(f"Processing PDF: {pdf_url}")

        # Streaming PDF instead of downloading
        pdf_response = requests.get(pdf_url, stream=True)
        if pdf_response.status_code == 200:
            pdf_bytes = BytesIO(pdf_response.content)  # Converting to a file-like object
            text = extract_text(pdf_bytes)  # Extracting text directly
            pdf_texts.append({"file_name": pdf_url.split("/")[-1], "content": text})

print(f"✅ Processed {len(pdf_texts)} PDFs without storing them locally")

Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i1040gi.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/pcir230.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i56.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i172.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i461.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706a.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706d.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gsd1.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706gst.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706na.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i706qdt.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i709.pdf
Processing PDF: https://www.irs.gov/pub/irs-pdf/i720.pdf
Pro

In [3]:
!pip install markdownify

Collecting markdownify
  Downloading markdownify-0.14.1-py3-none-any.whl.metadata (8.5 kB)
Downloading markdownify-0.14.1-py3-none-any.whl (11 kB)
Installing collected packages: markdownify
Successfully installed markdownify-0.14.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from markdownify import markdownify as md

# Convert extracted text to Markdown (In-Memory)
pdf_markdown = [
    {"file_name": doc["file_name"].replace(".pdf", ".md"), "content": md(doc["content"])}
    for doc in pdf_texts
]

print(f"✅ Converted {len(pdf_markdown)} PDFs to Markdown format")

✅ Converted 408 PDFs to Markdown format


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize chunking
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Chunk the Markdown documents
chunks = []
for doc in pdf_markdown:
    split_texts = text_splitter.split_text(doc["content"])
    for i, text in enumerate(split_texts):
        chunks.append({"file_name": doc["file_name"], "chunk_id": i, "content": text})

print(f"✅ Created {len(chunks)} text chunks")

✅ Created 70922 text chunks


In [7]:
import pinecone

# Initialize Pinecone
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Set index name
index_name = "tax-rag"

# Check if the index already exists
existing_indexes = [index.name for index in pc.list_indexes()]
if index_name not in existing_indexes:
    print(f"Creating new Pinecone index: {index_name}")
    pc.create_index(
        name=index_name,
        spec=pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        dimension=384,  # Make sure this matches your embeddings' dimension
        metric="cosine"
    )

# Connect to the existing index
index = pc.Index(index_name)
print(f"✅ Connected to Pinecone index: {index_name}")

  from tqdm.autonotebook import tqdm


✅ Connected to Pinecone index: tax-rag


In [8]:
from sentence_transformers import SentenceTransformer

# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert each chunk into an embedding
chunk_embeddings = [
    {
        "file_name": chunk["file_name"],
        "chunk_id": chunk["chunk_id"],
        "content": chunk["content"],
        "embedding": embedding_model.encode(chunk["content"]).tolist()  # Ensure it's a list, not a NumPy array
    }
    for chunk in chunks
]

# Store in Pinecone with batching
def batch_upsert(index, vectors, batch_size=100):
    """Uploads embeddings in batches to avoid Pinecone's request size limit."""
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i : i + batch_size]  # Get batch slice
        index.upsert(batch)
        print(f"✅ Uploaded batch {i // batch_size + 1}/{(len(vectors) // batch_size) + 1}")

# Prepare vectors for Pinecone
vectors = [
    (f"{chunk['file_name']}_{chunk['chunk_id']}", chunk["embedding"], {"text": chunk["content"]})
    for chunk in chunk_embeddings
]

# Upload in batches
batch_upsert(index, vectors)

print("✅ All embeddings successfully stored in Pinecone!")



✅ Uploaded batch 1/710
✅ Uploaded batch 2/710
✅ Uploaded batch 3/710
✅ Uploaded batch 4/710
✅ Uploaded batch 5/710
✅ Uploaded batch 6/710
✅ Uploaded batch 7/710
✅ Uploaded batch 8/710
✅ Uploaded batch 9/710
✅ Uploaded batch 10/710
✅ Uploaded batch 11/710
✅ Uploaded batch 12/710
✅ Uploaded batch 13/710
✅ Uploaded batch 14/710
✅ Uploaded batch 15/710
✅ Uploaded batch 16/710
✅ Uploaded batch 17/710
✅ Uploaded batch 18/710
✅ Uploaded batch 19/710
✅ Uploaded batch 20/710
✅ Uploaded batch 21/710
✅ Uploaded batch 22/710
✅ Uploaded batch 23/710
✅ Uploaded batch 24/710
✅ Uploaded batch 25/710
✅ Uploaded batch 26/710
✅ Uploaded batch 27/710
✅ Uploaded batch 28/710
✅ Uploaded batch 29/710
✅ Uploaded batch 30/710
✅ Uploaded batch 31/710
✅ Uploaded batch 32/710
✅ Uploaded batch 33/710
✅ Uploaded batch 34/710
✅ Uploaded batch 35/710
✅ Uploaded batch 36/710
✅ Uploaded batch 37/710
✅ Uploaded batch 38/710
✅ Uploaded batch 39/710
✅ Uploaded batch 40/710
✅ Uploaded batch 41/710
✅ Uploaded batch 42/710
✅

In [9]:
query = "Which forms do I need for self-employment?"
query_embedding = embedding_model.encode(query).tolist()

results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print top results
for match in results["matches"]:
    print(f"🔹 Score: {match['score']}")
    print(f"📄 Text: {match['metadata']['text']}\n")

🔹 Score: 0.617472708
📄 Text: your share of the applicable income, deduction, or loss. Each of you must also file a separate Schedule SE (Form 1040), Self-Employment Tax, to pay self-employment tax, as applicable.

🔹 Score: 0.598380625
📄 Text: Schedule SE (Form 1040), Self-Employment Tax, to complete your return.
You may only need to file Form 1040-SS and none of the schedules. However, if your return is more complicated (for
example, you claim certain deductions or credits or owe additional taxes), you will need to complete one or more of the
schedules. Below is a general guide to which schedule(s) you will need to file based on your circumstances. See the

🔹 Score: 0.586371124
📄 Text: self-employment under the farm optional method on
Schedule SE (Form 1040), Part II.
Code C. Gross nonfarm income. If you're an individual
partner, use this amount to figure net earnings from
self-employment under the nonfarm optional method on
Schedule SE (Form 1040), Part II.
Box 15. Credits
If you have