### Fetch IRS PDF Links

In [4]:
# fetch_irs_pdf_links.py
import requests
from bs4 import BeautifulSoup

def get_pdf_links(base_url):
    """Return absolute URLs of the publication PDFs linked from ``base_url``.

    An anchor is kept when its ``href`` contains ``'pub'`` and ends with
    ``'.pdf'``. Links are returned in document order, and a link that
    appears several times on the page appears several times in the result.

    Args:
        base_url: URL of the page to scan.

    Returns:
        A list of absolute PDF URLs (possibly empty).

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Function-scope import so the cell does not depend on another cell
    # having imported it.
    from urllib.parse import urljoin

    # Without a timeout a stalled server would hang the notebook forever.
    res = requests.get(base_url, timeout=30)
    # Fail loudly instead of silently parsing an error page into [].
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if 'pub' in href and href.endswith('.pdf'):
            # urljoin leaves absolute hrefs untouched and resolves
            # root-relative, path-relative and protocol-relative hrefs
            # against the page URL, instead of gluing them onto a fixed host.
            links.append(urljoin(base_url, href))

    return links

# Demo call against the "About Form 941" page; as the last expression in the
# cell, its return value is what the notebook shows as the cell output.
get_pdf_links('https://www.irs.gov/forms-pubs/about-form-941')

['https://www.irs.gov/pub/irs-pdf/i1040gi.pdf',
 'https://www.irs.gov/pub/irs-pdf/pcir230.pdf',
 'https://www.irs.gov/pub/irs-pdf/i1040gi.pdf',
 'https://www.irs.gov/pub/irs-pdf/pcir230.pdf',
 'https://www.irs.gov/pub/irs-pdf/f941.pdf',
 'https://www.irs.gov/pub/irs-pdf/i941.pdf',
 'https://www.irs.gov/pub/irs-pdf/f941sb.pdf',
 'https://www.irs.gov/pub/irs-pdf/i941sb.pdf',
 'https://www.irs.gov/pub/irs-pdf/f941sd.pdf',
 'https://www.irs.gov/pub/irs-pdf/i941sd.pdf',
 'https://www.irs.gov/pub/irs-pdf/f941sr.pdf',
 'https://www.irs.gov/pub/irs-pdf/i941sr.pdf']

### Scrape IRS PDFs Table

In [None]:
# Listing page that scrape_table() walks by appending ?page=N.
BASE_URL = "https://www.irs.gov/forms-instructions-and-publications"
# Local folder for IRS documents.
# NOTE(review): not referenced by any code visible in this part of the
# notebook; confirm whether it is still needed.
DOWNLOAD_FOLDER = "./irs_docs"

def scrape_table():
    """Scrape the IRS listing at ``BASE_URL`` for 2024/2025 revisions.

    Requests ``BASE_URL?page=N`` for N = 0, 1, 2, ... and reads the first
    ``<table>`` on each page. A row is kept when it has at least four cells,
    its first cell contains a link, and its third cell (revision date)
    mentions "2024" or "2025". Pagination stops at the first page that has
    no table, has a table without data rows, or answers with an HTTP error.

    The result is written to ``output/filtered_irs_pubs_2024_2025.csv``
    (the directory is created if needed) and also returned.

    Returns:
        pandas.DataFrame: One row per kept product with columns ``name``,
        ``title``, ``revision_date``, ``posted_date`` and ``pdf_url``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Function-scope imports keep this cell runnable on its own; pandas is
    # not imported by any cell visible in this part of the notebook.
    import os
    from urllib.parse import urljoin

    import pandas as pd

    output_path = "output/filtered_irs_pubs_2024_2025.csv"
    wanted_years = ("2024", "2025")

    rows = []
    page = 0
    while True:
        url = f"{BASE_URL}?page={page}"
        res = requests.get(url, timeout=30)
        if not res.ok:
            # Keep what was collected so far rather than crashing mid-crawl,
            # but make the early stop visible.
            print(f"Stopping at page {page}: HTTP {res.status_code}")
            break

        soup = BeautifulSoup(res.text, "html.parser")
        table = soup.find("table")
        if not table:
            break

        # First <tr> is the header row.
        data_rows = table.find_all("tr")[1:]
        if not data_rows:
            # A header-only table means we ran past the last page. Without
            # this guard the loop would never terminate on such a site.
            break

        for tr in data_rows:
            cols = tr.find_all("td")
            if len(cols) < 4:
                continue
            product_link = cols[0].find("a")
            if not product_link:
                continue
            href = product_link.get("href")
            if not href:
                # Anchor without a target: nothing to download.
                continue

            revision = cols[2].text.strip()
            if not any(year in revision for year in wanted_years):
                continue

            rows.append({
                "name": product_link.text.strip(),
                "title": cols[1].text.strip(),
                "revision_date": revision,
                "posted_date": cols[3].text.strip(),
                # Resolves root-relative hrefs against the IRS host and
                # leaves already-absolute hrefs intact.
                "pdf_url": urljoin(BASE_URL, href),
            })

        page += 1

    df = pd.DataFrame(rows)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"✅ Scraped {len(df)} items from IRS")
    return df

# Runs the full crawl when the cell executes (network requests plus the CSV
# write inside scrape_table) and binds the resulting DataFrame to `df`.
df = scrape_table()

### Download IRS PDFs

In [None]:
# download_pdfs.py
import os
import requests

def download_pdfs(links, download_folder="./pdfs"):
    """Download every URL in ``links`` into ``download_folder``.

    The local filename is the last segment of the URL path. A file that
    already exists is left untouched, so the function can be re-run to
    resume an interrupted batch.

    Args:
        links: Iterable of URLs to fetch.
        download_folder: Target directory; created if it does not exist.

    Raises:
        requests.RequestException: On connection failure or timeout.
            HTTP error statuses are reported with ``print`` and skipped.
    """
    from urllib.parse import urlsplit

    os.makedirs(download_folder, exist_ok=True)

    for link in links:
        # Take the name from the URL *path* so that a query string or
        # fragment ("f941.pdf?rev=2024") does not end up in the filename.
        filename = urlsplit(link).path.rsplit('/', 1)[-1]
        if not filename:
            print(f"Skipping {link}: URL has no filename")
            continue

        file_path = os.path.join(download_folder, filename)
        if os.path.exists(file_path):
            continue

        print(f"Downloading {filename}")
        r = requests.get(link, timeout=60)
        try:
            r.raise_for_status()
        except requests.HTTPError as err:
            # Do not save a 404/500 error page under a .pdf name.
            print(f"Skipping {filename}: {err}")
            continue

        # Write to a temporary name and rename once complete. A partial
        # file left by an interrupted write would otherwise be mistaken
        # for a finished download by the exists() check on the next run.
        tmp_path = file_path + ".part"
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, file_path)


### Extract Chunks

In [12]:
import pdfplumber
import re
import tiktoken

# Use GPT-3.5 encoding to estimate token size.
# `enc` is the module-level encoder that count_tokens() calls.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Hand-written table of contents: one entry per section, giving its title and
# the page on which it starts. extract_by_semantic_toc() subtracts 1 from
# start_page before indexing pdf.pages, so the numbers here are 1-based, and
# it ends each section where the next entry begins, so entries need to stay
# in ascending page order.
toc = [
    {"title": "What's New", "start_page": 6},
    {"title": "Filing Requirements", "start_page": 8},
    {"title": "Line Instructions for Forms 1040 and 1040-SR", "start_page": 12},
    {"title": "Income", "start_page": 23},
    {"title": "Refund", "start_page": 57},
    {"title": "Sign Your Return", "start_page": 62},
    {"title": "2024 Tax Table", "start_page": 64},
    {"title": "General Information", "start_page": 77},
    {"title": "How To Get Tax Help", "start_page": 79},
    {"title": "Refund Information", "start_page": 83},
    {"title": "Instructions for Schedule 1", "start_page": 84},
    {"title": "Instructions for Schedule 2", "start_page": 96},
    {"title": "Instructions for Schedule 3", "start_page": 101},
    {"title": "Tax Topics", "start_page": 104},
    {"title": "Disclosure, Privacy Act...", "start_page": 106},
    {"title": "Major Categories of Income and Outlays", "start_page": 108},
    {"title": "Index", "start_page": 110}
]

def count_tokens(text):
    """Return how many tokens the module-level ``enc`` encoder yields for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

def subchunk_text(text, max_tokens=500, token_counter=None):
    """Greedily pack the lines of ``text`` into chunks under a token budget.

    ``text`` is split on newlines; lines whose stripped length is 30
    characters or less are dropped (page numbers, running headers and
    similar debris). The remaining lines are joined with single spaces into
    a chunk for as long as the chunk stays strictly below ``max_tokens``.

    A single line that is itself at or above the budget is emitted as its
    own (oversized) chunk; it is never split. No empty chunk is ever
    returned.

    Args:
        text: Text to split.
        max_tokens: Exclusive upper bound on the token count of a chunk.
        token_counter: Optional callable ``str -> int`` used to measure a
            candidate chunk. Defaults to the module's ``count_tokens``.

    Returns:
        list[str]: Chunks in original order.
    """
    if token_counter is None:
        # Resolved at call time so the default follows the module's encoder.
        token_counter = count_tokens

    stripped_lines = (line.strip() for line in text.split('\n'))
    paragraphs = [line for line in stripped_lines if len(line) > 30]

    chunks = []
    current = ""
    for para in paragraphs:
        # Measure exactly the string that would be emitted, so every chunk
        # is judged the same way (no stray leading space on the first one).
        candidate = f"{current} {para}" if current else para
        if token_counter(candidate) < max_tokens:
            current = candidate
            continue

        # Budget exceeded: flush what has accumulated, but never flush an
        # empty buffer. That happens when the very first paragraph is
        # already too long, and used to produce a "" chunk.
        if current:
            chunks.append(current)
        current = para

    if current:
        chunks.append(current)

    return chunks

def extract_by_semantic_toc(pdf_path, toc, doc_id="irs_1040_instr_2024",
                            source="IRS 1040 Instructions 2024",
                            jurisdiction="federal"):
    """Split a PDF into token-bounded chunks, one TOC section at a time.

    Each TOC entry defines a section running from its own ``start_page`` up
    to (not including) the next entry's ``start_page``; the last section
    runs to the end of the document. The text of every section is extracted,
    whitespace-normalised and passed through ``subchunk_text``.

    Args:
        pdf_path: Path of the PDF to read.
        toc: Sequence of dicts with ``"title"`` and 1-based ``"start_page"``,
            in ascending page order.
        doc_id: Identifier copied into every chunk.
        source: Human-readable source label copied into every chunk.
        jurisdiction: Jurisdiction label copied into every chunk.

    Returns:
        list[dict]: One dict per chunk with keys ``doc_id``, ``section``,
        ``section_chunk_index``, ``text``, ``start_page``, ``end_page``
        (both 1-based, ``end_page`` inclusive), ``jurisdiction``, ``source``.
    """
    all_chunks = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        for i, entry in enumerate(toc):
            # Convert the 1-based TOC page number to a 0-based slice start.
            start_page = entry['start_page'] - 1
            if i + 1 < len(toc):
                # 0-based exclusive slice end == 1-based inclusive last page.
                end_page = toc[i + 1]['start_page'] - 1
            else:
                end_page = total_pages
            section_title = entry['title']

            # `or ""` guards against extract_text() yielding None for a page
            # without a text layer, which would break the join below.
            page_texts = [
                page.extract_text() or ""
                for page in pdf.pages[start_page:end_page]
            ]
            section_text = "\n".join(page_texts)

            # Collapse runs of 2+ whitespace characters; single newlines
            # survive and are what subchunk_text splits on.
            section_text = re.sub(r'\s{2,}', ' ', section_text).strip()
            subchunks = subchunk_text(section_text)

            all_chunks.extend(
                {
                    "doc_id": doc_id,
                    "section": section_title,
                    "section_chunk_index": j,
                    "text": chunk,
                    "start_page": start_page + 1,
                    "end_page": end_page,
                    "jurisdiction": jurisdiction,
                    "source": source,
                }
                for j, chunk in enumerate(subchunks)
            )

    return all_chunks


# Chunk the 1040 instructions PDF using the hand-written `toc`.
chunks = extract_by_semantic_toc("pdfs/i1040gi.pdf", toc)

# Example: embed into Pinecone or store in JSON
# Here the list of chunk dicts is written to 1040_chunks.json, pretty-printed
# with a 2-space indent.
import json
with open("1040_chunks.json", "w") as f:
    json.dump(chunks, f, indent=2)

In [14]:
import pdfplumber

# Probe pdfplumber's table extraction on one page of the 1040 instructions.
with pdfplumber.open("pdfs/i1040gi.pdf") as pdf:
    page = pdf.pages[47]  # Page with EIC table (index 47 is 0-based, i.e. the 48th page)
    tables = page.extract_tables()

# Printed after the PDF has been closed; `tables` holds plain nested lists.
print(tables)

[[['If the amount you\nare looking up from\nthe worksheet is–', None, 'And your filing status is–', None], [None, None, 'Single, head of household,\nor qualifying surviving\nspouse★ and you have–\n0 1 2 3', 'Married filing jointly and you\nhave–\n0 1 2 3'], ['At least', 'But less\nthan', 'Your credit is–', 'Your credit is–'], ['12,000 12,050', None, '502 4,089 4,810 5,411', '632 4,089 4,810 5,411'], ['12,050 12,100', None, '499 4,106 4,830 5,434', '632 4,106 4,830 5,434'], ['12,100 12,150', None, '495 4,123 4,850 5,456', '632 4,123 4,850 5,456'], ['12,150 12,200', None, '491 4,140 4,870 5,479', '632 4,140 4,870 5,479'], ['12,200 12,250\n12,250 12,300\n12,300 12,350\n12,350 12,400', None, '487 4,157 4,890 5,501\n483 4,174 4,910 5,524\n479 4,191 4,930 5,546\n476 4,213 4,950 5,569', '632 4,157 4,890 5,501\n632 4,174 4,910 5,524\n632 4,191 4,930 5,546\n632 4,213 4,950 5,569'], ['12,400 12,450', None, '472 4,213 4,970 5,591', '632 4,213 4,970 5,591'], ['12,450 12,500', None, '468 4,213 4,99

### Embed and Store in Pinecone

In [None]:
# embed_store.py
import openai
import pinecone
import os

# Credentials are read from the environment; os.getenv returns None when a
# variable is unset, so a missing key surfaces only on the first API call.
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): module-level pinecone.init(...) looks like the older
# pinecone-client interface; confirm it matches the installed version.
pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), environment="us-west1-gcp")
# Handle to the "tax-research" index that embed_and_store() upserts into.
index = pinecone.Index("tax-research")

def embed_and_store(chunks, metadata, batch_size=100):
    """Embed text chunks with OpenAI and upsert them into the Pinecone index.

    Chunks are processed in batches: one embedding request and one upsert
    per batch instead of one of each per chunk. Vector ids are
    ``f"{metadata['doc_id']}_{i}"`` where ``i`` is the chunk's position in
    ``chunks``.

    Args:
        chunks: Sequence of chunk texts to embed.
        metadata: Dict stored alongside every vector. Must contain
            ``"doc_id"``. The chunk's own text is stored under ``"text"``
            and takes precedence over a ``"text"`` key in ``metadata``.
        batch_size: Maximum number of chunks per request; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``metadata`` has no ``"doc_id"`` and there is at least
            one chunk to store.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keep each chunk's original position so ids do not shift when blank
    # strings are skipped. Blank strings are not sent to the embeddings
    # endpoint, so one empty chunk cannot abort the whole run.
    pending = [
        (i, chunk)
        for i, chunk in enumerate(chunks)
        if not (isinstance(chunk, str) and not chunk.strip())
    ]

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        response = openai.Embedding.create(
            input=[chunk for _, chunk in batch],
            model="text-embedding-ada-002"
        )
        # Each returned item carries the index of the input it belongs to;
        # sort on it rather than relying on response order.
        embedded = sorted(response['data'], key=lambda item: item['index'])

        index.upsert([
            (
                f"{metadata['doc_id']}_{i}",
                item['embedding'],
                # "text" goes last so the stored text is always the text
                # that was actually embedded.
                {**metadata, "text": chunk},
            )
            for (i, chunk), item in zip(batch, embedded)
        ])
