In [24]:
import os
import shutil
import json
import requests
import fitz
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from chromadb.config import Settings
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from transformers import AutoTokenizer
import numpy as np

# Environment setup

In [25]:
def setup_environment():
    import sys
    sys.path.append('C:\\gitworkspace\\aimldemo\\jupyterworkapce')
    import stratup_env_setup
    stratup_env_setup.set_env()

In [26]:
setup_environment()

In [27]:
class HuggingFaceEmbeddings:
    """Custom wrapper for Hugging Face SentenceTransformer embeddings."""
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(model_name)
    
    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.embedding_model.encode(texts, convert_to_tensor=False).tolist()
    
    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.embedding_model.encode(text, convert_to_tensor=False).tolist()

In [28]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Load tokenizer for all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [30]:
#chunk_size=500
chunk_size=5000
chunk_overlap=50
max_token_limit_for_pooling=250

In [31]:
pdf_path = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf"
output_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs"

In [32]:
highlighted_pdf_path = os.path.join(output_folder, "highlighted_output.pdf")

In [33]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

# Vectorize PDF Document

In [34]:
db_path = "chroma_db"

In [35]:
if os.path.exists(db_path):
        print(f"[INFO] Deleting existing Chroma database at {db_path} to avoid conflicts.")
        shutil.rmtree(db_path, ignore_errors=True)

In [36]:
embeddings = HuggingFaceEmbeddings()

In [37]:
chroma_settings = Settings(persist_directory=db_path, anonymized_telemetry=False)

In [38]:
vector_db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=chroma_settings)

In [39]:
def compute_pooled_embedding(text_chunks, model, pooling_strategy="mean", weighting_strategy="equal"):
    """Computes pooled embedding for token-limited text chunks"""
    embeddings = model.encode(text_chunks, convert_to_numpy=True)

    if weighting_strategy == "equal":
        weights = np.ones(len(text_chunks)) / len(text_chunks)  # Equal weighting
    else:
        raise ValueError("Unsupported weighting strategy")

    if pooling_strategy == "mean":
        pooled_embedding = np.average(embeddings, axis=0, weights=weights)
    else:
        raise ValueError("Unsupported pooling strategy")

    return pooled_embedding.tolist()

In [40]:
# Function to split text into token-based chunks
def split_text_by_tokens(text, max_tokens):
    """Splits a text into chunks of max_tokens tokens using the model's tokenizer."""
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk_text = tokenizer.decode(tokens[start:end])
        chunks.append(chunk_text)
        start += max_tokens  # No overlap in token-based splitting
    return chunks

In [41]:
def extract_chunks_with_coordinates(pdf_path):
    """Extract chunks from the PDF along with their coordinates (bounding boxes)."""
    doc = fitz.open(pdf_path)
    chunks_with_metadata = []
       

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_text = page.get_text("blocks")
        full_text = " ".join(block[4] for block in page_text)

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        text_chunks = splitter.split_text(full_text)
        
        print("#"*50)
        print(f"For page no: {page_num+1} Number of chunks: {len(text_chunks)}")
        # Print token count for each chunk
        for i, chunk in enumerate(text_chunks):
            token_count = len(tokenizer.encode(chunk, add_special_tokens=True))
            print(f"Chunk {i+1}: {token_count} tokens")
            
            # If chunk exceeds max_token_limit_for_pooling, split and pool embeddings
            if token_count > max_token_limit_for_pooling:
                sub_chunks = split_text_by_tokens(chunk, max_token_limit_for_pooling)
                pooled_embedding = compute_pooled_embedding(sub_chunks, embeddings.embedding_model)
                final_chunk_text = " ".join(sub_chunks)  # Merge sub-chunks
            else:
                pooled_embedding = embeddings.embed_query(chunk)
                final_chunk_text = chunk  # Keep original chunk
                
            # Extract coordinates
            coordinates = []
            search_results = page.search_for(chunk)
            for rect in search_results:
                coordinates.append((rect.x0, rect.y0, rect.x1, rect.y1))

            # Store chunk with metadata
            chunks_with_metadata.append({
                "text": final_chunk_text,
                "page": page_num,
                "coordinates": coordinates,
                "embedding": pooled_embedding
            })

    return chunks_with_metadata

In [42]:
text_chunks_with_metadata = extract_chunks_with_coordinates(pdf_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (983 > 512). Running this sequence through the model will result in indexing errors


##################################################
For page no: 1 Number of chunks: 1
Chunk 1: 983 tokens
##################################################
For page no: 2 Number of chunks: 1
Chunk 1: 787 tokens
##################################################
For page no: 3 Number of chunks: 1
Chunk 1: 776 tokens
##################################################
For page no: 4 Number of chunks: 1
Chunk 1: 770 tokens
##################################################
For page no: 5 Number of chunks: 1
Chunk 1: 876 tokens
##################################################
For page no: 6 Number of chunks: 1
Chunk 1: 608 tokens


In [43]:
text_chunks_with_metadata[0]

{'text': 'sba form 2287 ( 04 - 18 ) 1 previous editions obsolete third party lender agreement this third party lender agreement ( “ agreement " ) is dated this _ _ _ _ _ _ day of _ _ _ _ _ _ _ _ _ _ _ _ _ _, 20 _ _ _ _, by and between _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _, ( “ third party lender ” ) whose address is _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _, and _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _, ( “ cdc ” ) whose address is _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _. recitals 1. the third party lender and cdc will provide separate loans to the borrower and operating company, if any ( collectively “ borrower ” ), according to the terms in the authorization for debenture guarantee ( sba 504 loan ), as amended ( “ authorization ” ). the th

In [44]:
chunk_texts = [item["text"] for item in text_chunks_with_metadata]
metadata = [{"page": item["page"], "coordinates": json.dumps(item["coordinates"])} for item in text_chunks_with_metadata]
embeddings_list = [item["embedding"] for item in text_chunks_with_metadata]

In [105]:
chunk_texts[1]

'sba form 2287 ( 04 - 18 ) 2 previous editions obsolete terms and conditions in consideration of the above, the mutual agreements set forth below, and for other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the parties agree as follows : 1. amount of third party loan. the third party lender represents that the third party loan is fully advanced ; does not exceed the amount stated in the authorization ; and, will not exceed the amount allowed by the authorization, plus reasonable costs of collection, maintenance, and protection of the third party lender lien. any amounts owed by borrower to lender in excess of the third party lender lien amount stated in the authorization cannot be secured by a lien on the common collateral unless it is subordinate to the 504 loan. 2. subordination of 504 loan. cdc agrees to make the 504 loan to the borrower, subject to sba ’ s approval, and accept a junior and subordinate lien position in the common coll

In [111]:
chunk_tokens = tokenizer.tokenize(chunk_texts[1])

In [115]:
type(chunk_tokens)

list

In [116]:
len(chunk_tokens)

785

In [117]:
chunk_tokens

['sb',
 '##a',
 'form',
 '228',
 '##7',
 '(',
 '04',
 '-',
 '18',
 ')',
 '2',
 'previous',
 'editions',
 'obsolete',
 'terms',
 'and',
 'conditions',
 'in',
 'consideration',
 'of',
 'the',
 'above',
 ',',
 'the',
 'mutual',
 'agreements',
 'set',
 'forth',
 'below',
 ',',
 'and',
 'for',
 'other',
 'good',
 'and',
 'valuable',
 'consideration',
 ',',
 'the',
 'receipt',
 'and',
 'su',
 '##ffi',
 '##ciency',
 'of',
 'which',
 'are',
 'here',
 '##by',
 'acknowledged',
 ',',
 'the',
 'parties',
 'agree',
 'as',
 'follows',
 ':',
 '1',
 '.',
 'amount',
 'of',
 'third',
 'party',
 'loan',
 '.',
 'the',
 'third',
 'party',
 'lend',
 '##er',
 'represents',
 'that',
 'the',
 'third',
 'party',
 'loan',
 'is',
 'fully',
 'advanced',
 ';',
 'does',
 'not',
 'exceed',
 'the',
 'amount',
 'stated',
 'in',
 'the',
 'authorization',
 ';',
 'and',
 ',',
 'will',
 'not',
 'exceed',
 'the',
 'amount',
 'allowed',
 'by',
 'the',
 'authorization',
 ',',
 'plus',
 'reasonable',
 'costs',
 'of',
 'collect

In [118]:
metadata[0]

{'page': 0,
 'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [141.0, 82.16398620605469, 144.0, 98.17198944091797], [177.0, 82.16398620605469, 180.0, 98.17198944091797], [138.72000122070312, 107.96397399902344, 141.72000122070312, 123.97197723388672], [72.0, 133.7639617919922, 75.0, 149.77195739746094], [306.0, 159.56394958496094, 309.0, 175.5719451904297], [155.0399932861328, 184.8782196044922, 168.40667724609375, 212.6336212158203], [168.36000061035156, 189.1377410888672, 214.11094665527344, 211.24234008789062], [214.0800018310547, 184.8782196044922, 226.32444763183594, 212.6336212158203], [226.32000732421875, 189.1377410888672, 275.67633056640625, 211.24234008789062], [275.760009765625, 184.8782196044922, 289.1266784667969, 2

In [47]:
vector_db.add_texts(chunk_texts, metadatas=metadata, embeddings=embeddings_list)

['197a6489-3f86-4fa5-9d07-ed83896a04b2',
 '6526039d-e02b-49e8-95dd-1c3c002b9a8c',
 'c3aac443-8159-4f12-a2bd-8706b23d8f03',
 'ddc900aa-2090-4047-b473-46168ff7ead5',
 'a61add84-6ea6-4ba5-afaa-94aa1022c5cf',
 '32f62325-af93-4852-8d32-610cb9d0ee94']

In [48]:
vector_db.persist()

  vector_db.persist()


# Chat Query

In [49]:
#user_query = 'What does this document say about Balloon payments?'
#user_query = "what does this document say about authority to execute agreement?"
user_query = 'About Balloon payments?'

In [50]:
chat_history = [
    {"role": "user", "content": "CDC agrees to make the 504 Loan to the Borrower subject to what?"},
    {"role": "assistant", "content": "SBA's approval."},
    {"role": "user", "content": "Third Party Lender waives its rights for what?"},
    {"role": "assistant", "content": "Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements."},
    {"role": "user", "content": "what does this document say about authority to execute agreement?"},
    {"role": "assistant", "content": "The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party."}
]

* Rephrase the query to identify teh user's intended full question using LLM

In [51]:
template = """
Given the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. The rewritten query should follow the style and intent of the previous user queries.

Chat History:
{chat_history}

Current user query: {query}

Rewritten query:
"""

In [52]:
# Format the chat history into a string
formatted_chat_history = "\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history])


In [53]:
# Create the prompt for the LLM
prompt = PromptTemplate.from_template(template).format(
    chat_history=formatted_chat_history,
    query=user_query
)

In [54]:
prompt

"\nGiven the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. The rewritten query should follow the style and intent of the previous user queries.\n\nChat History:\nUser: CDC agrees to make the 504 Loan to the Borrower subject to what?\nAssistant: SBA's approval.\nUser: Third Party Lender waives its rights for what?\nAssistant: Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements.\nUser: what does this document say about authority to execute agreement?\nAssistant: The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party.\n\nCurrent user query: About Balloon payments?\n\nRewritten query:\n"

In [55]:
# Get the response from the LLM
rewritten_query = llm(prompt)

  rewritten_query = llm(prompt)


In [56]:
rewritten_query

AIMessage(content='What does the document say about Balloon payments?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 166, 'total_tokens': 177, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-89fb00f9-a29f-4033-8cfc-05d1ab794b4a-0', usage_metadata={'input_tokens': 166, 'output_tokens': 11, 'total_tokens': 177, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [57]:
query = rewritten_query.content

In [58]:
# reaphrased query to be searched in vector DB
query

'What does the document say about Balloon payments?'

# Fetch relevant chunks from Vector DB

In [59]:
top_chunks_with_scores = vector_db.similarity_search_with_score(query, k=3)

In [60]:
top_chunks_with_scores

[(Document(metadata={'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [108.0, 70.33489990234375, 531.4722900390625, 85.06226348876953], [107.98892211914062, 82.9315185546875, 530.2137451171875, 97.65888214111328], [107.98892974853516, 95.64959716796875, 110.74893188476562, 110.37696075439453], [108.0, 108.2548828125, 110.76000213623047, 122.98224639892578], [144.00143432617188, 108.2548828125, 154.08094787597656, 122.98224639892578], [180.00286865234375, 108.2548828125, 540.229248046875, 122.98224639892578], [108.0, 120.974853515625, 523.3866577148438, 135.70220947265625], [107.98895263671875, 133.57147216796875, 527.734130859375, 148.298828125], [107.98892211914062, 146.1680908203125, 520.8927001953125, 160.89544677734375], [10

In [61]:
# Extract chunk texts for cross-encoding
top_chunks = [chunk.page_content for chunk, _ in top_chunks_with_scores]

In [62]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [63]:
query_chunk_pairs = [(query, chunk) for chunk in top_chunks]

In [64]:
query_chunk_pairs

[('What does the document say about Balloon payments?',
  'sba form 2287 ( 04 - 18 ) 3 previous editions obsolete failure to pay taxes when due or violation of any financial covenants which would cause a prudent lender to believe that the prospect of payment or performance of the third party note is impaired. c. no cross - collateralization. third party lender agrees that the common collateral will only secure its third party loan and the common collateral is not currently, and will not be used in the future, as security for any other financing provided by third party lender to borrower that purports to be in a superior position to that of the cdc lien, unless authorized in writing by cdc and sba. d. no cross - default. during the term of the 504 loan, third party lender will not exercise any cross - default, " deem at - risk, " or any other provisions in documents evidencing the third party loan or third party lender lien which allow third party lender to make demand on the third part

In [65]:
cross_encoded_scores = cross_encoder.predict(query_chunk_pairs)

In [66]:
# Pair chunks with their cross-encoder scores and sort them in descending order
ranked_chunks_with_scores = sorted(
    zip(top_chunks_with_scores, cross_encoded_scores),
    key=lambda x: x[1],  # Sort by score (descending)
    reverse=True
)

In [67]:
top_chunk_cross_encoded, top_cross_encoded_score = ranked_chunks_with_scores[0][0]

In [68]:
metadata_of_top_chunk = top_chunk_cross_encoded.metadata

In [69]:
top_chunk_cross_encoded

Document(metadata={'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [108.0, 70.33489990234375, 531.4722900390625, 85.06226348876953], [107.98892211914062, 82.9315185546875, 530.2137451171875, 97.65888214111328], [107.98892974853516, 95.64959716796875, 110.74893188476562, 110.37696075439453], [108.0, 108.2548828125, 110.76000213623047, 122.98224639892578], [144.00143432617188, 108.2548828125, 154.08094787597656, 122.98224639892578], [180.00286865234375, 108.2548828125, 540.229248046875, 122.98224639892578], [108.0, 120.974853515625, 523.3866577148438, 135.70220947265625], [107.98895263671875, 133.57147216796875, 527.734130859375, 148.298828125], [107.98892211914062, 146.1680908203125, 520.8927001953125, 160.89544677734375], [107.

In [70]:
metadata_of_top_chunk

{'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [108.0, 70.33489990234375, 531.4722900390625, 85.06226348876953], [107.98892211914062, 82.9315185546875, 530.2137451171875, 97.65888214111328], [107.98892974853516, 95.64959716796875, 110.74893188476562, 110.37696075439453], [108.0, 108.2548828125, 110.76000213623047, 122.98224639892578], [144.00143432617188, 108.2548828125, 154.08094787597656, 122.98224639892578], [180.00286865234375, 108.2548828125, 540.229248046875, 122.98224639892578], [108.0, 120.974853515625, 523.3866577148438, 135.70220947265625], [107.98895263671875, 133.57147216796875, 527.734130859375, 148.298828125], [107.98892211914062, 146.1680908203125, 520.8927001953125, 160.89544677734375], [107.98892211914062, 15

# LLM query

In [71]:
llm_instruction = "Please answer the question by carefully considering both the provided context and the chat history. Use the context for accurate information and take into account any relevant details from the chat history to generate a well-informed response. If enough information is not available, please respond by saying: 'There is not enough information to answer the question.'"

In [72]:
# Create the prompt
template = """
Context: 
{context}

Chat history:
{chat_history}

Instruction: 
{llm_instruction}

User question: {question}
"""

In [73]:
formatted_prompt = PromptTemplate.from_template(template).format(
    context=top_chunk_cross_encoded.page_content,
    chat_history="\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history]),
    question=query,
    llm_instruction=llm_instruction
)

In [74]:
formatted_prompt

'\nContext: \nsba form 2287 ( 04 - 18 ) 3 previous editions obsolete failure to pay taxes when due or violation of any financial covenants which would cause a prudent lender to believe that the prospect of payment or performance of the third party note is impaired. c. no cross - collateralization. third party lender agrees that the common collateral will only secure its third party loan and the common collateral is not currently, and will not be used in the future, as security for any other financing provided by third party lender to borrower that purports to be in a superior position to that of the cdc lien, unless authorized in writing by cdc and sba. d. no cross - default. during the term of the 504 loan, third party lender will not exercise any cross - default, " deem at - risk, " or any other provisions in documents evidencing the third party loan or third party lender lien which allow third party lender to make demand on the third party loan prior to maturity unless the third par

In [75]:
# Get response from LLM
llm_response = llm.predict(formatted_prompt)

  llm_response = llm.predict(formatted_prompt)


In [76]:
print("#"*50)
print(llm_response)

##################################################
The document states that any balloon payment for the third party loan must be clearly identified and disclosed to SBA and approved at application or subsequently approved by SBA.


# PDF Highlighting

In [77]:
# Configurable cosine similarity threshold
COSINE_SIMILARITY_THRESHOLD = 0.5

In [78]:
page_num =  metadata_of_top_chunk["page"]

In [79]:
page_num

2

In [80]:
coordinates_str = metadata_of_top_chunk["coordinates"]

In [81]:
coordinates_str

'[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [108.0, 70.33489990234375, 531.4722900390625, 85.06226348876953], [107.98892211914062, 82.9315185546875, 530.2137451171875, 97.65888214111328], [107.98892974853516, 95.64959716796875, 110.74893188476562, 110.37696075439453], [108.0, 108.2548828125, 110.76000213623047, 122.98224639892578], [144.00143432617188, 108.2548828125, 154.08094787597656, 122.98224639892578], [180.00286865234375, 108.2548828125, 540.229248046875, 122.98224639892578], [108.0, 120.974853515625, 523.3866577148438, 135.70220947265625], [107.98895263671875, 133.57147216796875, 527.734130859375, 148.298828125], [107.98892211914062, 146.1680908203125, 520.8927001953125, 160.89544677734375], [107.98892211914062, 158.88616943359375

In [82]:
coordinates = json.loads(coordinates_str)

In [83]:
coordinates

[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875],
 [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875],
 [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875],
 [72.0, 745.435546875, 172.51608276367188, 756.4824829101562],
 [108.0, 70.33489990234375, 531.4722900390625, 85.06226348876953],
 [107.98892211914062, 82.9315185546875, 530.2137451171875, 97.65888214111328],
 [107.98892974853516,
  95.64959716796875,
  110.74893188476562,
  110.37696075439453],
 [108.0, 108.2548828125, 110.76000213623047, 122.98224639892578],
 [144.00143432617188, 108.2548828125, 154.08094787597656, 122.98224639892578],
 [180.00286865234375, 108.2548828125, 540.229248046875, 122.98224639892578],
 [108.0, 120.974853515625, 523.3866577148438, 135.70220947265625],
 [107.98895263671875, 133.57147216796875, 527.734130859375, 148.298828125],
 [107.98892211914062,
  146.1680908203125,
  520.8927001953125,
  160.89544677734375],
 [107.9889221191

In [84]:
pdf_path

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf'

In [85]:
highlighted_pdf_path

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs\\highlighted_output.pdf'

In [86]:
doc = fitz.open(pdf_path)

In [87]:
page = doc.load_page(page_num)

In [88]:
type(page) 

pymupdf.Page

In [89]:
page.get_text()

' \n \n \n \n SBA Form 2287 (04-18) \n \n          3 \n Previous Editions Obsolete \nfailure to pay taxes when due or violation of  any financial covenants which would cause a prudent \nlender to believe that the  prospect of payment or performance of the Third Party Note is impaired. \n \n \nc. \nNo Cross-Collateralization.   Third Party Lender agrees that the Common Collateral \nwill only secure its Third Party Loan and the Common Collateral is not currently, and will not be \nused in the future, as security for any other financing provided by Third Party Lender to Borrower \nthat purports to be in a superior position to that of the CDC Lien, unless authorized in writing by \nCDC and SBA. \n \n \nd. \nNo Cross-Default.   During the term of the 504 Loan, Third Party Lender will not \nexercise any cross-default, "deem at-risk," or any other provisions in documents evidencing the Third \nParty Loan or Third Party Lender Lien which allow Third Party Lender to make demand on the \nThird P

In [90]:
chunk_text = " ".join(page.get_textbox(fitz.Rect(*coord)) for coord in coordinates)

In [91]:
chunk_text

'SBA Form 2287 (04-18) \nPrevious Editions Obsole             3   SBA Form 2287 (04-18) \n Previous Editions Obsolete  failure to pay taxes when due or violation of  any financial covenants which would cause a prudent \nlender to believe that the  prospect of payment or performance of the Third Party Note is impaired.  failure to pay taxes when due or violation of  any financial covenants which would cause a prudent \nlender to believe that the  prospect of payment or performance of the Third Party Note is impaired. \n  l\n \n   \n \nw c. \ny se No Cross-Collateralization.   Third Party Lender agrees that the Common Collateral \nits Third Party Loan and the Common Collateral is not currently, and will not be   \nc. \nNo Cross-Collateralization.   Third Party Lender agrees that the Common Collate\nwill only secure its Third Party Loan and the Common Collateral is not currently, and will not be \nused in the future, as security for any other financing provided by Third Party Lender to Bo

In [92]:
sentences = nltk.sent_tokenize(chunk_text)

In [93]:
# Initialize Sentence Transformer for cosine similarity
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [94]:
# Compute embeddings for the LLM response and sentences
llm_response_embedding = similarity_model.encode(llm_response, convert_to_tensor=True)
sentence_embeddings = similarity_model.encode(sentences, convert_to_tensor=True)

In [95]:
# Compute cosine similarities
similarities = util.cos_sim(llm_response_embedding, sentence_embeddings)[0].tolist()

In [96]:
# Pair sentences with their similarity scores
sentences_with_scores = list(zip(sentences, similarities))

In [97]:
# Sort sentences by similarity score in descending order
sorted_sentences_with_scores = sorted(sentences_with_scores, key=lambda x: x[1], reverse=True)

In [98]:
for sentence, score in sorted_sentences_with_scores:
    print(f"Score: {score:.4f}, Sentence: {sentence}")
    print("#"*20)

Score: 0.9778, Sentence: Any balloon payment for the Third Party Loan must be clearly identified and disclosed to 
SBA and approved at application or subsequently approved by SBA.
####################
Score: 0.9087, Sentence: Any balloon payment for the Third Party Loan must be clearly 
SBA and approved at application or subsequently approved by SBA.
####################
Score: 0.7380, Sentence: Any balloon payment for the Third Party Loan must be clearly identified and disclosed to  overall loan maturity must be calculated, taking into account the amounts and maturities of each 
loan.
####################
Score: 0.6104, Sentence: The Third Party Loan has a reasonable interest r
which does not and will not exceed the maximum interest rate for Third Party Loans from 
commercial financial institutions as published periodically by SBA in the Federal Register  which does not and will not exceed the maximum interest rate for Third Party Loans from 
commercial financial institutions as publi

In [99]:
# Find high-similarity sentences and their coordinates
high_similarity_sentences = []
for sentence, score in sorted_sentences_with_scores:
    if score >= COSINE_SIMILARITY_THRESHOLD:
        search_results = page.search_for(sentence)
        if search_results:  # Ensure search_results is not empty
            high_similarity_sentences.append(fitz.Rect(*search_results[0]))

In [100]:
high_similarity_sentences

[Rect(135.36038208007812, 297.96624755859375, 532.0783081054688, 312.693603515625),
 Rect(143.62855529785156, 525.7296142578125, 529.6664428710938, 540.4569702148438),
 Rect(259.4261169433594, 437.16961669921875, 500.5091552734375, 451.89697265625),
 Rect(107.99040222167969, 487.69488525390625, 303.70196533203125, 502.4222412109375)]

In [101]:
# Highlight the entire chunk in yellow
for coord in coordinates:
    rect = fitz.Rect(*coord)
    page.add_highlight_annot(rect)

In [102]:
# Highlight high-similarity sentences in green
for rect in high_similarity_sentences:
    highlight = page.add_highlight_annot(rect)
    highlight.set_colors(stroke=(0, 1, 0))  # Green color
    highlight.update()

In [103]:
doc.save(highlighted_pdf_path, garbage=4)
doc.close()