In [27]:
import fitz
from langchain.chains.question_answering.map_rerank_prompt import output_parser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.output_parsers import PydanticOutputParser, JsonOutputParser
from langchain_openai import ChatOpenAI

# Embedding model used when the policy chunks are indexed in FAISS further down.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Chat model shared by the parser and QA chain defined later in the notebook.
# NOTE(review): `st` is not imported in any visible cell - presumably
# `import streamlit as st` ran in an earlier cell; confirm, otherwise this
# line raises NameError on a fresh kernel.
base_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    openai_api_key=st.secrets["hg_key"],
)


def extract_pdf_structure(file_path, h1_min_size=15, h2_min_size=12):
    """Extract a PDF's text as Markdown-like plain text with heading markers.

    Every text span is emitted on its own line. A span whose font size is
    strictly greater than ``h1_min_size`` is prefixed with ``"# "``; otherwise,
    if it is strictly greater than ``h2_min_size``, it is prefixed with
    ``"## "``; all other spans are emitted unchanged.

    Args:
        file_path: Path of the PDF, passed straight to ``fitz.open``.
        h1_min_size: Font size (``span['size']``) above which a span is
            treated as a level-1 heading. Defaults to 15.
        h2_min_size: Font size above which a span is treated as a level-2
            heading. Defaults to 12.

    Returns:
        str: All emitted lines joined with ``"\n"`` (empty string when the
        document yields no text spans).
    """
    doc = fitz.open(file_path)
    try:
        content = []
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Type 0 marks a text block; anything else has no lines/spans to read.
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        text = span['text']
                        font_size = span['size']
                        # Headings are inferred purely from font size.
                        if font_size > h1_min_size:
                            content.append(f"# {text}")  # H1
                        elif font_size > h2_min_size:
                            content.append(f"## {text}")  # H2
                        else:
                            content.append(text)
    finally:
        # Always release the document, even if a page fails to parse.
        doc.close()
    return "\n".join(content)

# One splitter instance is reused for every policy document.
# The first two separators match the "## " / "# " heading prefixes that
# extract_pdf_structure writes at the start of a line; plain newlines and
# periods are the remaining separators in the list.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n## ", "\n# ", "\n", "."],
    chunk_overlap=100,
    chunk_size=2000,
)



In [29]:
# Onsite policy: PDF -> heading-marked text -> chunked Documents.
onsite_policy = extract_pdf_structure("../data/Mercury Onsite Policy.pdf")
onsite_policy_chunks = text_splitter.create_documents(texts=[onsite_policy])

In [30]:
# General expense policy: PDF -> heading-marked text -> chunked Documents.
default_policy = extract_pdf_structure("../data/Mercury_Expense_General_Policy.pdf")
default_policy_chunks = text_splitter.create_documents(texts=[default_policy])

In [31]:
# Number of chunks the splitter produced for the general expense policy.
len(default_policy_chunks)

14

In [32]:
# Display every chunk for a manual look at where the splits landed.
# NOTE(review): this prints the whole list; a slice would keep the saved output short.
default_policy_chunks

[Document(metadata={}, page_content='# 💳\n# Expense Policy (2025 updates)\nNote: This policy was announced at the MMM on 12/19/23. Q&A session on 01/04/24. \n💡\nMercury cards are issued with a $500 monthly spend limit. If you need an increase please submit a request in \n#io-credit-limit-changes-or-card-requests\n.'),
 Document(metadata={}, page_content="## Overview\nStarting 01/01/24, \nLunch Perks\n  will be taxed on a monthly basis (similar to wellness benefits) and must be categorized as \nsuch by the cardholder using Mercury.com’s Accounting Page or the mobile app\nAll, not just Lunch Perks, card transactions must be categorized \n(\n GL Code\n  is what we need) by the 1st of the following \nmonth\nFollow our \nCredit Card Categorization Policy\nThis (coupled with the need for a receipt on file for any transaction > $75) will align Mercury with IRS guidance. We \ndo not make these rules but are getting compliant with them\nThe standard $250 Daily Credit Limit will be switching to 

### VectorStore Time

In [33]:

import numpy as np
# Embed the general-policy chunks and index them in an in-memory FAISS store.
faiss_index = FAISS.from_documents(
    documents=default_policy_chunks,
    embedding=embeddings,
)

In [34]:
# Retrieval sanity check: look up the chunks most similar to a term from the policy.
faiss_index.similarity_search('Lunch Perk')

[Document(id='fc981a5b-018e-4991-bb16-065ce927fba6', metadata={}, page_content='## Mercury Card Uses\nAccounting page is available in Mercury (web)\n to better review and categorize all your IO transactions.\n GL Code\n  for each \ntransaction can be chosen (individually) via the latest iOS/Android app as well\nGL Code\n  is what we need. \nBecause of timely financial and payroll reporting needs, being out of office does not \nabsolve us from coding our transactions on time. Please plan accordingly\nLunch Perks\n  = $100 per week for Meals\nDo not double dip\n. The purpose of this perk is to provide lunch during your work day. If you are traveling for work or \non vacation or on leave, then you should not also spend your lunch perk on top of that\nIf you are over/under for a week, that’s okay and you can even it out the next week\nA bulk purchase for the month is okay too. Example: Spend $400 at Price Chopper and don’t use your card for \nLunch Perks for 4 weeks\nAlthough not an immedi

In [75]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, List, Literal
from enum import Enum
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain.output_parsers import OutputFixingParser

# Step 1: Wrap the existing FAISS index as a retriever.
# (The embeddings and the index themselves were created in earlier cells.)
retriever = faiss_index.as_retriever()

class ApprovalResponse(BaseModel):
    # Structured verdict for a single transaction checked against the policy.
    # (Documented with comments rather than a docstring so the schema
    # generated from this model is left untouched.)

    # One of the three allowed verdict labels.
    policy_flag: Literal['Allowed', 'Disallowed', 'More Information Required'] = Field(
        ...,
        description="Indicates the policy status of the transaction.",
    )
    # Policy text backing the verdict.
    policy_explanation: str = Field(
        ...,
        description="Relevant information from the policy documents justifying the policy_flag value.",
    )
    # What the cardholder or reviewer should do next.
    recommendation: str = Field(
        ...,
        description="Recommendation containing next steps for additional information, or updating categories/GL codes for the transaction.",
    )

# Parser that validates raw model text against the ApprovalResponse schema.
policy_parser = PydanticOutputParser(pydantic_object=ApprovalResponse)

# Chat model bound to the same schema through structured output.
# NOTE(review): policy_llm is not referenced by any later visible cell -
# confirm whether it is still needed.
policy_llm = base_llm.with_structured_output(ApprovalResponse)

# Step 2: Define a Custom Prompt Template
# {context} receives the retrieved policy chunks, {question} the transaction text.
custom_prompt = PromptTemplate(
    template="""
    You are an intelligent assistant that responds in strict JSON format with the following fields:
      - policy_flag: ['Allowed', 'Disallowed', 'More Information Required']
      - policy_explanation: Justification for the policy_flag value
      - recommendation: Next steps for resolving the transaction

    Context:
    {context}

    Question:
    {question}

    Respond in JSON format:
    """,
    input_variables=["context", "question"],
)

# Wraps policy_parser with base_llm so the LLM can be used to repair
# replies that do not parse cleanly.
output_fixing_parser = OutputFixingParser.from_llm(
    parser=policy_parser,
    llm=base_llm,
)

# "stuff" chain: retrieved chunks are placed into the single prompt above.
qa_chain = RetrievalQA.from_chain_type(
    llm=base_llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=custom_prompt),
)

# Smoke test with one sample transaction description.
query = "United Airlines: $5500 Belarus"
answer = qa_chain.invoke(query)

# The chain's text reply is read from the "result" key, then parsed into ApprovalResponse.
result = output_fixing_parser.parse(answer.get("result"))


In [76]:
# Show the parsed ApprovalResponse for the sample query above.
result

ApprovalResponse(policy_flag='Disallowed', policy_explanation='The expense of $5500 for United Airlines to Belarus exceeds the allowable reimbursement for airfare, which is capped at the cost of a flight. Additionally, no justification for this expense has been provided, and it does not align with the stated reimbursement policies.', recommendation='Please provide a detailed justification for this expense and ensure that it falls within the allowable reimbursement limits. If this is a legitimate business expense, consider resubmitting with proper documentation and within policy guidelines.')

In [77]:
def policy_chain(input=None):
    """Check one transaction against the policy and return the parsed verdict.

    The input is sent through ``qa_chain``; the text found under the
    ``'result'`` key of its response is then parsed by
    ``output_fixing_parser``.

    Args:
        input: Transaction description to evaluate, e.g.
            ``"United Airlines: $5500 Belarus"``. Non-string inputs are
            forwarded to ``qa_chain.invoke`` unchanged.

    Returns:
        Whatever ``output_fixing_parser.parse`` returns (an
        ``ApprovalResponse`` in this notebook).

    Raises:
        ValueError: If ``input`` is ``None`` or a blank string, or if the
            chain response has no ``'result'`` entry.
    """
    # Fail fast: the default None (or a blank string) would otherwise be sent
    # to the chain and only fail later with a less helpful error.
    if input is None or (isinstance(input, str) and not input.strip()):
        raise ValueError("policy_chain requires a non-empty transaction description")

    answer = qa_chain.invoke(input)

    raw_result = answer.get('result')
    if raw_result is None:
        # Do not hand None to the parser; report the real problem instead.
        raise ValueError("QA chain response did not contain a 'result' entry")

    return output_fixing_parser.parse(raw_result)

In [78]:
# Run the same sample transaction through the policy_chain wrapper.
policy_chain("United Airlines: $5500 Belarus")

ApprovalResponse(policy_flag='Disallowed', policy_explanation='The expense of $5500 for United Airlines exceeds the allowable reimbursement for airfare, which must be up to the cost of a flight determined through Navan. There is no justification provided for this amount being within policy.', recommendation='Please provide a screenshot of comparable flight costs from Navan to determine the reimbursable amount. If the expense is for a specific company event, ensure it aligns with the approved budget and use the correct GL Code.')