In [1]:
# Manually setting keys while env is acting up - REMOVE BEFORE PUSH 
import os

os.environ["OPENAI_API_KEY"] = "HOLDER"
os.environ["PINECONE_API_KEY"] = "HOLDER"

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to repo 
REPO_PATH = "/Users/halladaykinsey/AAI540_ML" 

def load_markdown_files(directory):
    """Load all Markdown files from a given directory and return their content as a list of documents."""
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".md"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                documents.append({"file_name": filename, "content": file.read()})
    return documents

docs = load_markdown_files(REPO_PATH)
print(f"Loaded {len(docs)} markdown files.")
print(f"Sample document: {docs[0] if docs else 'No documents found!'}")


Loaded 7 markdown files.
Sample document: {'file_name': 'w9.md', 'content': '# Instructions for the Internal Revenue Service\n\n# Requester of Form W-9 (Rev. March 2024)\n\n# Request for Taxpayer Identification Number and Certification\n\nSection references are to the Internal Revenue Code unless otherwise noted.\n\n# Future Developments\n\nFor the latest developments related to Form W-9 and its instructions, such as legislation enacted after they were published, go to IRS.gov/FormW9.\n\n# What‚Äôs New\n\nLine 3a. We clarified that a Limited Liability Company (LLC) that is a disregarded entity should fill out line 3a by checking the appropriate box for the tax classification of its owner in the first row on line 3a. We also added guidance that provides clarity for disregarded entities completing lines 1 and 2. For proper processing, information for disregarded entities is reported as the owner‚Äôs name on line 1, and the disregarded entity‚Äôs name is entered on line 2.\n\nFor an LLC t

In [3]:
# Splitting text into smaller chunks
def chunk_text(documents, chunk_size=500, overlap=100):
    """Splits documents into chunks with overlap."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunked_docs = []
    
    for doc in documents:
        chunks = text_splitter.split_text(doc["content"])
        for i, chunk in enumerate(chunks):
            chunked_docs.append({
                "file_name": doc["file_name"],
                "chunk_id": i,
                "content": chunk
            })
    return chunked_docs

# Processing markdown files into chunks
chunks = chunk_text(docs)

# Printing sample chunk
print(f"Sample Chunk: {chunks[0]}")

Sample Chunk: {'file_name': 'w9.md', 'chunk_id': 0, 'content': '# Instructions for the Internal Revenue Service\n\n# Requester of Form W-9 (Rev. March 2024)\n\n# Request for Taxpayer Identification Number and Certification\n\nSection references are to the Internal Revenue Code unless otherwise noted.\n\n# Future Developments\n\nFor the latest developments related to Form W-9 and its instructions, such as legislation enacted after they were published, go to IRS.gov/FormW9.\n\n# What‚Äôs New'}


In [4]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

chunk_embeddings = [
    {
        "file_name": chunk["file_name"],
        "chunk_id": chunk["chunk_id"],
        "content": chunk["content"], 
        "embedding": embedding_model.encode(chunk["content"])
    }
    for chunk in chunks
]

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pinecone

# Initializing Pinecone 
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Setting index name
index_name = "tax-rag"

if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name, 
        spec=pinecone.ServerlessSpec(
            cloud="aws",  
            region="us-east-1"
        ), 
        dimension=384, 
        metric="cosine"
    )

# Connecting to index 
index = pc.Index(index_name)

print(f"Pinecone index '{index_name}' is ready!")

Pinecone index 'tax-rag' is ready!


In [6]:
# Preparing vectors for Pinecone
vectors = [
    (f"{chunk['file_name']}_{chunk['chunk_id']}", chunk["embedding"], {"text": chunk["content"]})
    for chunk in chunk_embeddings
]

In [7]:
# Function to batch and upload embeddings
def batch_upsert(index, vectors, batch_size=100):
    """Upserts embeddings in smaller batches to avoid Pinecone's request size limit."""
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i : i + batch_size]  # Get batch slice
        index.upsert(batch)  # Upload batch
        print(f"Uploaded batch {i // batch_size + 1}/{(len(vectors) // batch_size) + 1}")

# Uploading embeddings in batches
batch_upsert(index, vectors)

print("‚úÖ All embeddings successfully stored in Pinecone!")

Uploaded batch 1/38
Uploaded batch 2/38
Uploaded batch 3/38
Uploaded batch 4/38
Uploaded batch 5/38
Uploaded batch 6/38
Uploaded batch 7/38
Uploaded batch 8/38
Uploaded batch 9/38
Uploaded batch 10/38
Uploaded batch 11/38
Uploaded batch 12/38
Uploaded batch 13/38
Uploaded batch 14/38
Uploaded batch 15/38
Uploaded batch 16/38
Uploaded batch 17/38
Uploaded batch 18/38
Uploaded batch 19/38
Uploaded batch 20/38
Uploaded batch 21/38
Uploaded batch 22/38
Uploaded batch 23/38
Uploaded batch 24/38
Uploaded batch 25/38
Uploaded batch 26/38
Uploaded batch 27/38
Uploaded batch 28/38
Uploaded batch 29/38
Uploaded batch 30/38
Uploaded batch 31/38
Uploaded batch 32/38
Uploaded batch 33/38
Uploaded batch 34/38
Uploaded batch 35/38
Uploaded batch 36/38
Uploaded batch 37/38
Uploaded batch 38/38
‚úÖ All embeddings successfully stored in Pinecone!


In [8]:
query = "What forms do I need to file as a self-employed individual?"
query_embedding = embedding_model.encode(query).tolist()

In [9]:
# Search Pinecone for the most relevant chunks
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print top results
print("üîç Top matching tax instructions:")
for match in results["matches"]:
    print(f"üîπ Score: {match['score']}")
    print(f"üìÑ Text: {match['metadata']['text']}\n")

üîç Top matching tax instructions:
üîπ Score: 0.694014966
üìÑ Text: your share of the applicable income, deduction, or loss. Each of you must also file a separate Schedule SE (Form 1040), Self-Employment Tax, to pay self-employment tax, as applicable.

üîπ Score: 0.670240104
üìÑ Text: For more information on e-filing, see E-file for Business and Self-employed Taxpayers on IRS.gov.

# Waivers

üîπ Score: 0.609281301
üìÑ Text: |9. Enter the earned income you (and your spouse if filing jointly) received as a self-employed individual or a partner. Generally, this is your (and your spouse's if filing jointly) net earnings from self-employment if your personal services were a material income-producing factor, minus any deductions on Schedule 1, lines 15 and 16. If zero or less, enter -0-. For more details, see Pub. 590-A.| |
|10. Add lines 8 and 9.| |
|!|!|

üîπ Score: 0.562972605
üìÑ Text: You can order forms, instructions, and publications at IRS.gov/OrderForms. For any other tax 

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Initializing OpenAI GPT model for testing
openai_api_key = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)

# Converting retrieved text chunks into context
retrieved_texts = [match["metadata"]["text"] for match in results["matches"]]
context = "\n".join(retrieved_texts)

# Calling GPT-4 to respond using the retrieved context
response = llm.predict(f"Based on the following tax instructions, answer this question:\n\n{context}\n\nQuestion: {query}")

print("ü§ñ AI Response:")
print(response)

  llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)
  response = llm.predict(f"Based on the following tax instructions, answer this question:\n\n{context}\n\nQuestion: {query}")


ü§ñ AI Response:
As a self-employed individual, you need to file a Schedule SE (Form 1040) for Self-Employment Tax and possibly Form 1065 if you are part of a domestic partnership. If you have one or more employees, you also need to file Form(s) W-2.
