In [65]:
import os
import shutil
import json
import requests
import fitz
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from chromadb.config import Settings
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Environment setup

In [2]:
def setup_environment():
    import sys
    sys.path.append('C:\\gitworkspace\\aimldemo\\jupyterworkapce')
    import stratup_env_setup
    stratup_env_setup.set_env()

In [10]:
setup_environment()

In [11]:
class HuggingFaceEmbeddings:
    """Custom wrapper for Hugging Face SentenceTransformer embeddings."""
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(model_name)
    
    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.embedding_model.encode(texts, convert_to_tensor=False).tolist()
    
    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.embedding_model.encode(text, convert_to_tensor=False).tolist()

In [18]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
chunk_size=500
chunk_overlap=50

In [66]:
pdf_path = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf"
output_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs"

In [67]:
highlighted_pdf_path = os.path.join(output_folder, "highlighted_output.pdf")

In [68]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

# Vectorize PDF Document

In [5]:
pdf_path = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf"

In [6]:
db_path = "chroma_db"

In [9]:
if os.path.exists(db_path):
        print(f"[INFO] Deleting existing Chroma database at {db_path} to avoid conflicts.")
        shutil.rmtree(db_path, ignore_errors=True)

[INFO] Deleting existing Chroma database at chroma_db to avoid conflicts.


In [14]:
embeddings = HuggingFaceEmbeddings()

In [15]:
chroma_settings = Settings(persist_directory=db_path, anonymized_telemetry=False)

In [20]:
vector_db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=chroma_settings)

In [21]:
def extract_chunks_with_coordinates(pdf_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap):
    """Extract chunks from the PDF along with their coordinates (bounding boxes)."""
    doc = fitz.open(pdf_path)
    chunks_with_metadata = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_text = page.get_text("blocks")
        full_text = " ".join(block[4] for block in page_text)

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        text_chunks = splitter.split_text(full_text)

        for chunk in text_chunks:
            coordinates = []
            search_results = page.search_for(chunk)
            for rect in search_results:
                coordinates.append((rect.x0, rect.y0, rect.x1, rect.y1))

            chunks_with_metadata.append({
                "text": chunk,
                "page": page_num,
                "coordinates": coordinates
            })

    return chunks_with_metadata

In [22]:
text_chunks_with_metadata = extract_chunks_with_coordinates(pdf_path)

In [23]:
text_chunks_with_metadata[0]

{'text': 'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is',
 'page': 0,
 'coordinates': [(74.28336334228516,
   736.0769653320312,
   161.34852600097656,
   747.1239013671875),
  (287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875),
  (395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875),
  (72.0, 745.435546875, 172.51608276367188, 756.4824829101562),
  (141.0, 82.16398620605469, 144.0, 98.17198944091797),
  (177.0, 82.16398620605469, 180.0, 98.17198944091797),
  (138.72000122070312,
   107.96397399902344,
   141.72000122070312,
   

In [24]:
chunk_texts = [item["text"] for item in text_chunks_with_metadata]
metadata = [{"page": item["page"], "coordinates": json.dumps(item["coordinates"])} for item in text_chunks_with_metadata]

In [26]:
chunk_texts[0]

'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is'

In [27]:
metadata[0]

{'page': 0,
 'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [141.0, 82.16398620605469, 144.0, 98.17198944091797], [177.0, 82.16398620605469, 180.0, 98.17198944091797], [138.72000122070312, 107.96397399902344, 141.72000122070312, 123.97197723388672], [72.0, 133.7639617919922, 75.0, 149.77195739746094], [306.0, 159.56394958496094, 309.0, 175.5719451904297], [155.0399932861328, 184.8782196044922, 168.40667724609375, 212.6336212158203], [168.36000061035156, 189.1377410888672, 214.11094665527344, 211.24234008789062], [214.0800018310547, 184.8782196044922, 226.32444763183594, 212.6336212158203], [226.32000732421875, 189.1377410888672, 275.67633056640625, 211.24234008789062], [275.760009765625, 184.8782196044922, 289.1266784667969, 2

In [28]:
vector_db.add_texts(chunk_texts, metadatas=metadata)

['3a73adcd-f2e4-47e7-8543-749217f82630',
 'a05764fa-7e94-4aaf-9a5d-4fd9d4af6400',
 'eeb75dbf-7139-4196-a7d9-5451973bbd6d',
 '79ad2077-0c55-4e2f-a1db-f170b8b7bff2',
 '46ea2def-e371-4183-b7c4-5a940617ce4c',
 '2803179f-cd58-46da-ba98-0f5d44e6f966',
 'bf8a6693-a6a7-40f7-b9d1-1713cbba9be9',
 '73666281-ccfb-4e9a-a578-cd8af10cc18a',
 '51f56c96-5fe8-4cce-9de5-819df3911b4c',
 'f2971dcd-f97a-4178-9c40-c02fa45db1e0',
 'e121d4ad-b1d6-456a-ba1c-59c6f7a3005b',
 '30bf1265-8bc6-476f-b854-c4371e6b0484',
 '67d0200e-a83b-426f-9cb4-0fd526c5eec8',
 'c9f9bd51-ec84-49ec-8a50-111068c4a806',
 'f5131a96-4503-496c-871a-ae73116d043e',
 'f3bb3d44-8c35-446b-8f66-40db0a63a0a0',
 'c2c80f12-f8e1-4641-9840-e375c71231d9',
 '7878c08a-20db-4272-bf8e-66ef9d02a8ab',
 '7e48fa80-de52-4604-8b67-4711b0138a47',
 '6fd5076b-7783-48e6-9123-4d488b6cef76',
 '885d2643-0d0b-4ff6-b36f-5c635e2bf767',
 'c6ce05df-4b38-4464-a839-a2f643d5c716',
 'd4744246-eefb-4caf-ac2e-0326285d0858',
 'b4830c09-0e78-48ea-afed-a0af188ac50d',
 'f2fe4a78-e1be-

In [29]:
vector_db.persist()

  vector_db.persist()


# Chat Query

In [34]:
query = 'What does this document say about Balloon payments?'

In [60]:
chat_history = [
    "CDC agrees to make the 504 Loan to the Borrower subject to what?",
    "SBA's approval.",
    "Third Party Lender waives its rights for what?",
    "Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements."    
]

In [61]:
llm_instruction = 'Please answer the question by carefully considering both the provided context and the chat history. Use the context for accurate information and take into account any relevant details from the chat history to generate a well-informed response. If enough information is not avaiable, please respond saying There is not enough information to answer the question'

# Fetch relevent chunks from Vector DB

In [38]:
top_chunks_with_scores = vector_db.similarity_search_with_score(query, k=3)

In [39]:
top_chunks_with_scores

[(Document(metadata={'coordinates': '[[107.99996948242188, 285.3695983886719, 534.00146484375, 300.0969543457031], [107.99996948242188, 297.96624755859375, 532.0783081054688, 312.693603515625], [107.99996948242188, 310.684326171875, 413.28912353515625, 325.41168212890625], [432.012939453125, 310.684326171875, 434.77294921875, 325.41168212890625], [108.0, 323.2948913574219, 110.76000213623047, 338.0222473144531], [108.0, 336.014892578125, 110.76000213623047, 350.74224853515625], [144.00143432617188, 336.014892578125, 153.12046813964844, 350.74224853515625], [180.00286865234375, 336.014892578125, 525.3757934570312, 350.74224853515625], [108.0, 348.6148681640625, 503.7100524902344, 363.34222412109375]]', 'page': 2}, page_content='overall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to \nSBA and approved at application or subsequently approved by SBA. \

In [40]:
# Extract chunk texts for cross-encoding
top_chunks = [chunk.page_content for chunk, _ in top_chunks_with_scores]

In [41]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [43]:
query_chunk_pairs = [(query, chunk) for chunk in top_chunks]

In [44]:
query_chunk_pairs

[('What does this document say about Balloon payments?',
  'overall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to \nSBA and approved at application or subsequently approved by SBA. \n \n  \n \nf. \nReasonable Interest Rate.  The Third Party Loan has a reasonable interest rate \n which does not and will not exceed the maximum interest rate for Third Party Loans from'),
 ('What does this document say about Balloon payments?',
  'Provisions.      \n  \n \n \n  \n \na. \nThe term "Default Charges" used in this paragraph includes, but is not limited to, \nprepayment penalties, late fees, other default charges, and escalated interest after default due under \nthe Third Party Loan.'),
 ('What does this document say about Balloon payments?',
  "commercial financial institutions as published periodically by SBA in the Federal Register and in \neffect as of

In [45]:
cross_encoded_scores = cross_encoder.predict(query_chunk_pairs)

In [47]:
# Pair chunks with their cross-encoder scores and sort them in descending order
ranked_chunks_with_scores = sorted(
    zip(top_chunks_with_scores, cross_encoded_scores),
    key=lambda x: x[1],  # Sort by score (descending)
    reverse=True
)

In [51]:
top_chunk_cross_encoded, top_cross_encoded_score = ranked_chunks_with_scores[0][0]

In [57]:
metadata_of_top_chunk = top_chunk_cross_encoded.metadata

In [58]:
top_chunk_cross_encoded

Document(metadata={'coordinates': '[[107.99996948242188, 285.3695983886719, 534.00146484375, 300.0969543457031], [107.99996948242188, 297.96624755859375, 532.0783081054688, 312.693603515625], [107.99996948242188, 310.684326171875, 413.28912353515625, 325.41168212890625], [432.012939453125, 310.684326171875, 434.77294921875, 325.41168212890625], [108.0, 323.2948913574219, 110.76000213623047, 338.0222473144531], [108.0, 336.014892578125, 110.76000213623047, 350.74224853515625], [144.00143432617188, 336.014892578125, 153.12046813964844, 350.74224853515625], [180.00286865234375, 336.014892578125, 525.3757934570312, 350.74224853515625], [108.0, 348.6148681640625, 503.7100524902344, 363.34222412109375]]', 'page': 2}, page_content='overall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to \nSBA and approved at application or subsequently approved by SBA. \n 

In [59]:
metadata_of_top_chunk

{'coordinates': '[[107.99996948242188, 285.3695983886719, 534.00146484375, 300.0969543457031], [107.99996948242188, 297.96624755859375, 532.0783081054688, 312.693603515625], [107.99996948242188, 310.684326171875, 413.28912353515625, 325.41168212890625], [432.012939453125, 310.684326171875, 434.77294921875, 325.41168212890625], [108.0, 323.2948913574219, 110.76000213623047, 338.0222473144531], [108.0, 336.014892578125, 110.76000213623047, 350.74224853515625], [144.00143432617188, 336.014892578125, 153.12046813964844, 350.74224853515625], [180.00286865234375, 336.014892578125, 525.3757934570312, 350.74224853515625], [108.0, 348.6148681640625, 503.7100524902344, 363.34222412109375]]',
 'page': 2}

# LLM query

In [71]:
# Create the prompt
template = """
Context: 
{context}

Chat history:
{chat_history}

Instruction: 
{llm_instruction}

User question: {question}
"""

In [72]:
formatted_prompt = PromptTemplate.from_template(template).format(
            context=top_chunk_cross_encoded.page_content, chat_history="\n".join(chat_history), question=query, llm_instruction=llm_instruction
        )

In [73]:
formatted_prompt

"\nContext: \noverall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to \nSBA and approved at application or subsequently approved by SBA. \n \n  \n \nf. \nReasonable Interest Rate.  The Third Party Loan has a reasonable interest rate \n which does not and will not exceed the maximum interest rate for Third Party Loans from\n\nChat history:\nCDC agrees to make the 504 Loan to the Borrower subject to what?\nSBA's approval.\nThird Party Lender waives its rights for what?\nThird Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements.\n\nInstruction: \nPlease answer the question by carefully considering both the provided context and the chat history. Use the context for accurate information and take into account any relevant details from the chat history to generate a well-informed response. If enoug

In [91]:
# Get response from LLM
llm_response = llm.predict(formatted_prompt)

In [92]:
print("#"*50)
print(llm_response)

##################################################
This document indicates that any balloon payment for the Third Party Loan must be clearly identified and disclosed to SBA and approved at application or subsequently approved by SBA.


# PDF Highlighting

In [156]:
# Configurable cosine similarity threshold
COSINE_SIMILARITY_THRESHOLD = 0.5

In [117]:
page_num =  metadata_of_top_chunk["page"]

In [118]:
page_num

2

In [131]:
coordinates_str = metadata_of_top_chunk["coordinates"]

In [134]:
coordinates_str

'[[107.99996948242188, 285.3695983886719, 534.00146484375, 300.0969543457031], [107.99996948242188, 297.96624755859375, 532.0783081054688, 312.693603515625], [107.99996948242188, 310.684326171875, 413.28912353515625, 325.41168212890625], [432.012939453125, 310.684326171875, 434.77294921875, 325.41168212890625], [108.0, 323.2948913574219, 110.76000213623047, 338.0222473144531], [108.0, 336.014892578125, 110.76000213623047, 350.74224853515625], [144.00143432617188, 336.014892578125, 153.12046813964844, 350.74224853515625], [180.00286865234375, 336.014892578125, 525.3757934570312, 350.74224853515625], [108.0, 348.6148681640625, 503.7100524902344, 363.34222412109375]]'

In [165]:
coordinates = json.loads(coordinates_str)

In [166]:
coordinates

[[107.99996948242188, 285.3695983886719, 534.00146484375, 300.0969543457031],
 [107.99996948242188, 297.96624755859375, 532.0783081054688, 312.693603515625],
 [107.99996948242188,
  310.684326171875,
  413.28912353515625,
  325.41168212890625],
 [432.012939453125, 310.684326171875, 434.77294921875, 325.41168212890625],
 [108.0, 323.2948913574219, 110.76000213623047, 338.0222473144531],
 [108.0, 336.014892578125, 110.76000213623047, 350.74224853515625],
 [144.00143432617188,
  336.014892578125,
  153.12046813964844,
  350.74224853515625],
 [180.00286865234375, 336.014892578125, 525.3757934570312, 350.74224853515625],
 [108.0, 348.6148681640625, 503.7100524902344, 363.34222412109375]]

In [136]:
pdf_path

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf'

In [137]:
highlighted_pdf_path

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs\\highlighted_output.pdf'

In [138]:
doc = fitz.open(pdf_path)

In [139]:
page = doc.load_page(page_num)

In [140]:
type(page) 

pymupdf.Page

In [141]:
page.get_text()

' \n \n \n \n SBA Form 2287 (04-18) \n \n          3 \n Previous Editions Obsolete \nfailure to pay taxes when due or violation of  any financial covenants which would cause a prudent \nlender to believe that the  prospect of payment or performance of the Third Party Note is impaired. \n \n \nc. \nNo Cross-Collateralization.   Third Party Lender agrees that the Common Collateral \nwill only secure its Third Party Loan and the Common Collateral is not currently, and will not be \nused in the future, as security for any other financing provided by Third Party Lender to Borrower \nthat purports to be in a superior position to that of the CDC Lien, unless authorized in writing by \nCDC and SBA. \n \n \nd. \nNo Cross-Default.   During the term of the 504 Loan, Third Party Lender will not \nexercise any cross-default, "deem at-risk," or any other provisions in documents evidencing the Third \nParty Loan or Third Party Lender Lien which allow Third Party Lender to make demand on the \nThird P

In [142]:
chunk_text = " ".join(page.get_textbox(fitz.Rect(*coord)) for coord in coordinates)

In [143]:
chunk_text

'504 loan is for 20 or 25 years).  If the Third Party Lender has made more than one loan, then an \noverall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to  overall loan maturity must be calculated, taking into account the amounts and maturities of each \nloan.  Any balloon payment for the Third Party Loan must be clearly identified and disclosed to \nSBA and approved at application or subsequently approved by SBA. \n  loan.  Any balloon payment for the Third Party Loan must be clearly \nSBA and approved at application or subsequently approved by SBA. \n  nt\n  S\n \n   \n \nw f. \noe Reasonable Interest Rate.  The Third Party Loan has a reasonable interest rate \nand will not exceed the maximum interest rate for Third Party Loans from   \nf. \nReasonable Interest Rate.  The Third Party Loan has a reasonable interest \nwhich does not and will not ex

In [144]:
sentences = nltk.sent_tokenize(chunk_text)

In [145]:
# Initialize Sentence Transformer for cosine similarity
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [146]:
# Compute embeddings for the LLM response and sentences
llm_response_embedding = similarity_model.encode(llm_response, convert_to_tensor=True)
sentence_embeddings = similarity_model.encode(sentences, convert_to_tensor=True)

In [147]:
# Compute cosine similarities
similarities = util.cos_sim(llm_response_embedding, sentence_embeddings)[0].tolist()

In [149]:
# Pair sentences with their similarity scores
sentences_with_scores = list(zip(sentences, similarities))

In [150]:
# Sort sentences by similarity score in descending order
sorted_sentences_with_scores = sorted(sentences_with_scores, key=lambda x: x[1], reverse=True)

In [154]:
for sentence, score in sorted_sentences_with_scores:
    print(f"Score: {score:.4f}, Sentence: {sentence}")
    print("#"*20)

Score: 0.9782, Sentence: Any balloon payment for the Third Party Loan must be clearly identified and disclosed to 
SBA and approved at application or subsequently approved by SBA.
####################
Score: 0.9185, Sentence: Any balloon payment for the Third Party Loan must be clearly 
SBA and approved at application or subsequently approved by SBA.
####################
Score: 0.7432, Sentence: Any balloon payment for the Third Party Loan must be clearly identified and disclosed to  overall loan maturity must be calculated, taking into account the amounts and maturities of each 
loan.
####################
Score: 0.6368, Sentence: The Third Party Loan has a reasonable interest 
which does not and will not exceed the maximum interest rate for Third Party Loans from 
commercial financial institutions as published periodically by SBA in the Federal Register
####################
Score: 0.4901, Sentence: The Third Party Loan has a reasonable interest rate 
and will not exceed the maximum in

In [157]:
# Find high-similarity sentences and their coordinates
high_similarity_sentences = []
for sentence, score in sorted_sentences_with_scores:
    if score >= COSINE_SIMILARITY_THRESHOLD:
        search_results = page.search_for(sentence)
        if search_results:  # Ensure search_results is not empty
            high_similarity_sentences.append(fitz.Rect(*search_results[0]))

In [158]:
high_similarity_sentences

[Rect(135.36038208007812, 297.96624755859375, 532.0783081054688, 312.693603515625)]

In [163]:
# Highlight the entire chunk in yellow
for coord in coordinates:
    rect = fitz.Rect(*coord)
    page.add_highlight_annot(rect)

In [167]:
# Highlight high-similarity sentences in green
for rect in high_similarity_sentences:
    highlight = page.add_highlight_annot(rect)
    highlight.set_colors(stroke=(0, 1, 0))  # Green color
    highlight.update()

In [169]:
doc.save(highlighted_pdf_path, garbage=4)
doc.close()