In [117]:
import os
import shutil
import json
import requests
import fitz
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from chromadb.config import Settings
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Environment setup

In [118]:
def setup_environment():
    import sys
    sys.path.append('C:\\gitworkspace\\aimldemo\\jupyterworkapce')
    import stratup_env_setup
    stratup_env_setup.set_env()

In [119]:
setup_environment()

In [120]:
class HuggingFaceEmbeddings:
    """Custom wrapper for Hugging Face SentenceTransformer embeddings."""
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(model_name)
    
    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.embedding_model.encode(texts, convert_to_tensor=False).tolist()
    
    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.embedding_model.encode(text, convert_to_tensor=False).tolist()

In [121]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [122]:
chunk_size=500
chunk_overlap=50

In [123]:
#pdf_path = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf"
pdf_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs"
output_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs"

In [124]:
highlighted_pdf_path = os.path.join(output_folder, "highlighted_output.pdf")

In [125]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

# Vectorize PDF Document

In [126]:
db_path = "chroma_db"

In [127]:
if os.path.exists(db_path):
        print(f"[INFO] Deleting existing Chroma database at {db_path} to avoid conflicts.")
        shutil.rmtree(db_path, ignore_errors=True)

In [128]:
embeddings = HuggingFaceEmbeddings()

In [129]:
chroma_settings = Settings(persist_directory=db_path, anonymized_telemetry=False)

In [130]:
vector_db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=chroma_settings)

In [131]:
def extract_chunks_with_coordinates(pdf_path):
    """Extract chunks from the PDF along with their coordinates (bounding boxes) and file name."""
    doc = fitz.open(pdf_path)
    chunks_with_metadata = []

    # Extract the file name from the full path
    file_name = os.path.basename(pdf_path)

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_text = page.get_text("blocks")
        full_text = " ".join(block[4] for block in page_text)

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        text_chunks = splitter.split_text(full_text)

        for chunk in text_chunks:
            coordinates = []
            search_results = page.search_for(chunk)
            for rect in search_results:
                coordinates.append((rect.x0, rect.y0, rect.x1, rect.y1))

            chunks_with_metadata.append({
                "text": chunk,
                "page": page_num,
                "coordinates": coordinates,
                "file_name": file_name  # Add file name to metadata
            })

    return chunks_with_metadata


In [132]:
pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

In [133]:
pdf_files

['C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf',
 'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\lenovo_license_agreement.pdf']

In [134]:
text_chunks_with_metadata = []
for pdf_file in pdf_files:
    chunks = extract_chunks_with_coordinates(pdf_file)
    text_chunks_with_metadata.extend(chunks)

In [135]:
text_chunks_with_metadata[0]

{'text': 'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is',
 'page': 0,
 'coordinates': [(74.28336334228516,
   736.0769653320312,
   161.34852600097656,
   747.1239013671875),
  (287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875),
  (395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875),
  (72.0, 745.435546875, 172.51608276367188, 756.4824829101562),
  (141.0, 82.16398620605469, 144.0, 98.17198944091797),
  (177.0, 82.16398620605469, 180.0, 98.17198944091797),
  (138.72000122070312,
   107.96397399902344,
   141.72000122070312,
   

In [136]:
page_number = 2
file_name = 'Form_2287.pdf'

In [140]:
for chunk in text_chunks_with_metadata:
    if chunk['file_name'] == file_name and chunk['page'] == page_number:
        print("Text Chunk:", chunk['text'])
        print("Page Number:", chunk['page'])
        print("File Name:", chunk['file_name'])
        print("#" * 50)

Text Chunk: SBA Form 2287 (04-18) 
 
          3 
 Previous Editions Obsolete 
 failure to pay taxes when due or violation of  any financial covenants which would cause a prudent 
lender to believe that the  prospect of payment or performance of the Third Party Note is impaired. 
 
 
c. 
No Cross-Collateralization.   Third Party Lender agrees that the Common Collateral 
 will only secure its Third Party Loan and the Common Collateral is not currently, and will not be
Page Number: 2
File Name: Form_2287.pdf
##################################################
Text Chunk: used in the future, as security for any other financing provided by Third Party Lender to Borrower 
that purports to be in a superior position to that of the CDC Lien, unless authorized in writing by 
CDC and SBA. 
 
 
d. 
No Cross-Default.   During the term of the 504 Loan, Third Party Lender will not 
 exercise any cross-default, "deem at-risk," or any other provisions in documents evidencing the Third
Page Number: 2
Fi

In [141]:
chunk_texts = [item["text"] for item in text_chunks_with_metadata]
metadata = [{"page_number": item["page"], "file_name": item["file_name"], "coordinates": json.dumps(item["coordinates"])} for item in text_chunks_with_metadata]

In [142]:
chunk_texts[0]

'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is'

In [143]:
metadata[0]

{'page_number': 0,
 'file_name': 'Form_2287.pdf',
 'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [141.0, 82.16398620605469, 144.0, 98.17198944091797], [177.0, 82.16398620605469, 180.0, 98.17198944091797], [138.72000122070312, 107.96397399902344, 141.72000122070312, 123.97197723388672], [72.0, 133.7639617919922, 75.0, 149.77195739746094], [306.0, 159.56394958496094, 309.0, 175.5719451904297], [155.0399932861328, 184.8782196044922, 168.40667724609375, 212.6336212158203], [168.36000061035156, 189.1377410888672, 214.11094665527344, 211.24234008789062], [214.0800018310547, 184.8782196044922, 226.32444763183594, 212.6336212158203], [226.32000732421875, 189.1377410888672, 275.67633056640625, 211.24234008789062], [275.760009765625, 1

In [144]:
vector_db.add_texts(chunk_texts, metadatas=metadata)

['8da7d0a4-5008-4603-bd77-260972ecdb35',
 '118fa21f-a658-4f1c-b3bb-53839990e1c1',
 '20fb948b-a488-4e0b-b62b-71687b010244',
 '601f8059-4b52-4407-8442-362aa50914b1',
 '77524a06-9fe5-4bdd-a41b-531269a4f703',
 'a976601d-99bd-4a56-9564-40b9d62e4be5',
 '02ffe311-d074-445f-a86e-92c196f533f7',
 '7aceb540-7380-4ee3-9323-2e43204e7a5c',
 '668f76e5-8e65-42a8-9755-1d682bbc8318',
 'b9ea370f-c5b1-498d-9abc-000dfdc245e2',
 '08e649ad-7a68-4827-a0d2-65a2a00231b7',
 'fe20b68b-0d10-4e3f-8cf0-3f7d12e97286',
 '5cd40958-fb4b-450e-a3da-4fc845600fb2',
 '4cb8a1a2-1a67-435a-8357-819920f17224',
 'e13606b5-fff4-471a-953b-edbafcf85b4e',
 'afc40f4d-a349-4cfb-be7f-aa767c9f49ce',
 'fce9739e-9d9b-4e6c-94a7-7b1b617ba754',
 'e662dacb-654b-4e26-a106-5b045392076a',
 '0f8ace1f-09cd-420b-84f1-51670d207a60',
 'bce3826f-65b7-4f72-8cba-8184b92cd5a8',
 'e20dc6c9-b5e4-4c88-be30-a1d32e332a16',
 '85aca721-d84c-4df3-8103-3df08487bedd',
 'df821263-f680-48f5-b2c3-8304ffca708d',
 'daa7d7ab-4ab2-4b2c-97a3-29edc911f4f0',
 'c6853ebf-3aa2-

In [145]:
vector_db.persist()

# Chat Query

In [241]:
#user_query = 'What does this document say about Balloon payments?'
#user_query = "what does this document say about authority to execute agreement?"
user_query = 'About Balloon payments in page_number 3 of Form_2287.pdf?'
user_query = 'What does the docuemnt say about Transferability in page_number 3 of lenovo_license_agreement.pdf?'

In [242]:
chat_history = [
    {"role": "user", "content": "CDC agrees to make the 504 Loan to the Borrower subject to what?"},
    {"role": "assistant", "content": "SBA's approval."},
    {"role": "user", "content": "Third Party Lender waives its rights for what?"},
    {"role": "assistant", "content": "Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements."},
    {"role": "user", "content": "what does this document say about authority to execute agreement?"},
    {"role": "assistant", "content": "The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party."}
]

* Rephrase the query to identify teh user's intended full question using LLM

In [243]:
# Template to rephrase the user query and extract metadata
template = """
Given the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. Then, extract metadata (only "file_name" and "page_number" fields) if they are mentioned in the question or chat history.
Rewritten query shoudl not have refernce to metadata fields file_name and page_number.

Chat History:
{chat_history}

Current user query: {query}

Rewritten query:
[Provide the rewritten query here.]

Extracted Metadata (JSON format with only file_name and page_number fields):
[Provide the metadata here as a valid JSON object.]
"""

In [244]:
# Format the chat history into a string
formatted_chat_history = "\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history])

In [245]:
# Create the prompt for the LLM
prompt = PromptTemplate.from_template(template).format(
    chat_history=formatted_chat_history,
    query=user_query
)

In [246]:
prompt

'\nGiven the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. Then, extract metadata (only "file_name" and "page_number" fields) if they are mentioned in the question or chat history.\nRewritten query shoudl not have refernce to metadata fields file_name and page_number.\n\nChat History:\nUser: CDC agrees to make the 504 Loan to the Borrower subject to what?\nAssistant: SBA\'s approval.\nUser: Third Party Lender waives its rights for what?\nAssistant: Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements.\nUser: what does this document say about authority to execute agreement?\nAssistant: The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party.\n\nCurrent user query: What does the docuemnt say about Transferability in page_number 3 of lenovo_license_agreement.pdf

In [247]:
response = llm.predict(prompt)

In [248]:
response

'Rewritten query: What does the document say about Transferability in lenovo_license_agreement.pdf?\n\nExtracted Metadata:\n{\n  "file_name": "lenovo_license_agreement.pdf",\n  "page_number": 3\n}'

In [249]:
# Split the response into rewritten query and metadata parts
response_parts = response.split("\nExtracted Metadata:")

In [250]:
query = response_parts[0].replace("Rewritten query:", "").strip()

In [251]:
metadata_str = response_parts[1].strip()

In [252]:
metadata = json.loads(metadata_str)

In [253]:
metadata

{'file_name': 'lenovo_license_agreement.pdf', 'page_number': 3}

In [254]:
query

'What does the document say about Transferability in lenovo_license_agreement.pdf?'

# Fetch relevant chunks from Vector DB

In [255]:
 metadata_filter = {
        "$and": [
            {key: {"$eq": (value - 1 if key == "page_number" else value)}}
            for key, value in metadata.items()
        ]
    }

In [256]:
metadata_filter

{'$and': [{'file_name': {'$eq': 'lenovo_license_agreement.pdf'}},
  {'page_number': {'$eq': 2}}]}

In [257]:
top_chunks_with_scores = vector_db.similarity_search_with_score(query, k=3)
top_chunks_with_scores = vector_db.similarity_search_with_score(
    query=query,
    k=3,
    filter=metadata_filter
)

In [258]:
top_chunks_with_scores

[(Document(metadata={'coordinates': '[[72.02400207519531, 204.51251220703125, 79.56449127197266, 214.56719970703125], [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125], [108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125], [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453], [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453], [144.02000427246094, 235.66998291015625, 184.20199584960938, 247.85598754882812], [184.22000122070312, 237.15252685546875, 246.6890411376953, 247.20721435546875], [246.64999389648438, 235.62498474121094, 251.552001953125, 247.99098205566406], [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344], [144.02000427246094, 258.09503173828125, 542.5930786132812, 270.4610290527344], [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531], [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375], [72.02400207519531, 302.5825500488281,

In [259]:
# Extract chunk texts for cross-encoding
top_chunks = [chunk.page_content for chunk, _ in top_chunks_with_scores]

In [260]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [261]:
query_chunk_pairs = [(query, chunk) for chunk in top_chunks]

In [262]:
query_chunk_pairs

[('What does the document say about Transferability in lenovo_license_agreement.pdf?',
  '6. \nTransferability \n  \nYou may not transfer or assign the Software Product to any other party, except as permitted in this \nSection 6 “Transferability”. \n  \nPreinstalled Software Products are licensed for use only on the Lenovo hardware product on which \nthey are preinstalled or with which they are included and may be transferred only with that Lenovo \nhardware product. They may not be transferred independent of the Lenovo hardware product. \n 7. \nOpen Source and Other Third-Party Products'),
 ('What does the document say about Transferability in lenovo_license_agreement.pdf?',
  'event that all or any portion of the Software Product provided hereunder does not reference, or \notherwise indicate, such Open Source License, this Agreement shall control. \n  \nAny Open Source Software is provided on an “AS IS” basis, without any indemnity, representation \nor warranty of any kind being prov

In [263]:
cross_encoded_scores = cross_encoder.predict(query_chunk_pairs)

In [264]:
# Pair chunks with their cross-encoder scores and sort them in descending order
ranked_chunks_with_scores = sorted(
    zip(top_chunks_with_scores, cross_encoded_scores),
    key=lambda x: x[1],  # Sort by score (descending)
    reverse=True
)

In [265]:
top_chunk_cross_encoded, top_cross_encoded_score = ranked_chunks_with_scores[0][0]

In [266]:
metadata_of_top_chunk = top_chunk_cross_encoded.metadata

In [267]:
metadata_of_top_chunk

{'coordinates': '[[72.02400207519531, 204.51251220703125, 79.56449127197266, 214.56719970703125], [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125], [108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125], [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453], [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453], [144.02000427246094, 235.66998291015625, 184.20199584960938, 247.85598754882812], [184.22000122070312, 237.15252685546875, 246.6890411376953, 247.20721435546875], [246.64999389648438, 235.62498474121094, 251.552001953125, 247.99098205566406], [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344], [144.02000427246094, 258.09503173828125, 542.5930786132812, 270.4610290527344], [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531], [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375], [72.02400207519531, 302.5825500488281, 79.56449127197266, 

In [268]:
top_chunk_cross_encoded

Document(metadata={'coordinates': '[[72.02400207519531, 204.51251220703125, 79.56449127197266, 214.56719970703125], [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125], [108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125], [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453], [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453], [144.02000427246094, 235.66998291015625, 184.20199584960938, 247.85598754882812], [184.22000122070312, 237.15252685546875, 246.6890411376953, 247.20721435546875], [246.64999389648438, 235.62498474121094, 251.552001953125, 247.99098205566406], [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344], [144.02000427246094, 258.09503173828125, 542.5930786132812, 270.4610290527344], [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531], [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375], [72.02400207519531, 302.5825500488281, 7

In [269]:
metadata_of_top_chunk

{'coordinates': '[[72.02400207519531, 204.51251220703125, 79.56449127197266, 214.56719970703125], [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125], [108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125], [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453], [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453], [144.02000427246094, 235.66998291015625, 184.20199584960938, 247.85598754882812], [184.22000122070312, 237.15252685546875, 246.6890411376953, 247.20721435546875], [246.64999389648438, 235.62498474121094, 251.552001953125, 247.99098205566406], [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344], [144.02000427246094, 258.09503173828125, 542.5930786132812, 270.4610290527344], [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531], [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375], [72.02400207519531, 302.5825500488281, 79.56449127197266, 

# LLM query

In [270]:
llm_instruction = "Please answer the question by carefully considering both the provided context and the chat history. Use the context for accurate information and take into account any relevant details from the chat history to generate a well-informed response. If enough information is not available, please respond by saying: 'There is not enough information to answer the question.'"

In [271]:
# Create the prompt
template = """
Context: 
{context}

Chat history:
{chat_history}

Instruction: 
{llm_instruction}

User question: {question}
"""

In [272]:
formatted_prompt = PromptTemplate.from_template(template).format(
    context=top_chunk_cross_encoded.page_content,
    chat_history="\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history]),
    question=query,
    llm_instruction=llm_instruction
)

In [273]:
formatted_prompt

"\nContext: \n6. \nTransferability \n  \nYou may not transfer or assign the Software Product to any other party, except as permitted in this \nSection 6 “Transferability”. \n  \nPreinstalled Software Products are licensed for use only on the Lenovo hardware product on which \nthey are preinstalled or with which they are included and may be transferred only with that Lenovo \nhardware product. They may not be transferred independent of the Lenovo hardware product. \n 7. \nOpen Source and Other Third-Party Products\n\nChat history:\nUser: CDC agrees to make the 504 Loan to the Borrower subject to what?\nAssistant: SBA's approval.\nUser: Third Party Lender waives its rights for what?\nAssistant: Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements.\nUser: what does this document say about authority to execute agreement?\nAssistant: The persons signing below certify that they have been duly authorized to execute this Agreement

In [274]:
# Get response from LLM
llm_response = llm.predict(formatted_prompt)

In [275]:
print("#"*50)
print(llm_response)

##################################################
Based on the context provided in the lenovo_license_agreement.pdf, the document states that the Software Product may not be transferred or assigned to any other party, except as permitted in the Transferability section. It also specifies that Preinstalled Software Products can only be transferred with the Lenovo hardware product on which they are preinstalled or included, and cannot be transferred independently of the Lenovo hardware product.


# PDF Highlighting

In [276]:
# Configurable cosine similarity threshold
COSINE_SIMILARITY_THRESHOLD = 0.5

In [277]:
page_number =  metadata_of_top_chunk["page_number"]

In [278]:
page_number

2

In [279]:
coordinates_str = metadata_of_top_chunk["coordinates"]

In [280]:
coordinates_str

'[[72.02400207519531, 204.51251220703125, 79.56449127197266, 214.56719970703125], [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125], [108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125], [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453], [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453], [144.02000427246094, 235.66998291015625, 184.20199584960938, 247.85598754882812], [184.22000122070312, 237.15252685546875, 246.6890411376953, 247.20721435546875], [246.64999389648438, 235.62498474121094, 251.552001953125, 247.99098205566406], [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344], [144.02000427246094, 258.09503173828125, 542.5930786132812, 270.4610290527344], [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531], [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375], [72.02400207519531, 302.5825500488281, 79.56449127197266, 312.637237548828

In [281]:
coordinates = json.loads(coordinates_str)

In [282]:
coordinates

[[72.02400207519531,
  204.51251220703125,
  79.56449127197266,
  214.56719970703125],
 [79.58399963378906, 203.02996826171875, 82.08599853515625, 215.4229736328125],
 [108.0199966430664,
  203.02996826171875,
  172.56199645996094,
  215.4229736328125],
 [120.5, 225.3049774169922, 123.00199890136719, 237.6709747314453],
 [144.02000427246094, 225.3049774169922, 542.3473510742188, 237.6709747314453],
 [144.02000427246094,
  235.66998291015625,
  184.20199584960938,
  247.85598754882812],
 [184.22000122070312,
  237.15252685546875,
  246.6890411376953,
  247.20721435546875],
 [246.64999389648438,
  235.62498474121094,
  251.552001953125,
  247.99098205566406],
 [120.5, 258.09503173828125, 123.00199890136719, 270.4610290527344],
 [144.02000427246094,
  258.09503173828125,
  542.5930786132812,
  270.4610290527344],
 [144.02000427246094, 268.4150390625, 542.578125, 280.7810363769531],
 [144.02000427246094, 278.7350158691406, 518.842041015625, 291.10101318359375],
 [72.02400207519531, 302.582

In [283]:
file_for_highlighting = os.path.join(pdf_folder, metadata_of_top_chunk['file_name'])

In [284]:
file_for_highlighting

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\lenovo_license_agreement.pdf'

In [285]:
highlighted_pdf_path

'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs\\highlighted_output.pdf'

In [286]:
doc = fitz.open(file_for_highlighting)

In [287]:
page = doc.load_page(page_num)

In [288]:
type(page) 

pymupdf.Page

In [289]:
page.get_text()

'Lenovo License Agreement \n \n \nPage 3 of 6 \nCOE-30002-01 Lenovo License Agreement 03.2022 \n \nYou are solely responsible for maintaining the confidentiality of your Account information and \npassword. At all times, You are solely liable for any activities occurring on or through your Account, \neven if such activities may not be authorized by You. \n \nYou shall immediately notify Lenovo of any unauthorized use of your Account or if any other breach \nof security has occurred. In no event shall Lenovo be liable for any unauthorized use of your Account. \n \nYou shall be the Account owner when you create the Account. Subsequent changes to ownership \nmust be supported by appropriate legal documents. Lenovo will not adjudicate any ownership-related \ndispute. If Lenovo is unable to determine the valid owner of the Account, Lenovo may at its own \ndiscretion suspend or terminate the Account. \n6. \nTransferability \n \nYou may not transfer or assign the Software Product to any other 

In [290]:
chunk_text = " ".join(page.get_textbox(fitz.Rect(*coord)) for coord in coordinates)

In [291]:
chunk_text

'6.   Transferability    You may not transfer or assign the Software Product to any other party, except as permitted in this \nSection 6 “Transferability”.  You may n\nSection 6  not transfer or a\n“Transferability”. ass\n”.    Preinstalled Software Products are licensed for use only on the Lenovo hardware product on which \nthey are preinstalled or with which they are included and may be transferred only with that Lenovo  Preinstalled Software Products are licensed for use only on the Lenovo hardware product on which \nthey are preinstalled or with which they are included and may be transferred only with that Lenovo \nhardware product. They may not be transferred independent of the Lenovo hardware product.  they are preinstalled or with which they are included and may be transferred only with that Le\nhardware product. They may not be transferred independent of the Lenovo hardware product.  7.   Open Source and Other Third-Party Products '

In [292]:
sentences = nltk.sent_tokenize(chunk_text)

In [293]:
# Initialize Sentence Transformer for cosine similarity
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [294]:
# Compute embeddings for the LLM response and sentences
llm_response_embedding = similarity_model.encode(llm_response, convert_to_tensor=True)
sentence_embeddings = similarity_model.encode(sentences, convert_to_tensor=True)

In [295]:
# Compute cosine similarities
similarities = util.cos_sim(llm_response_embedding, sentence_embeddings)[0].tolist()

In [296]:
# Pair sentences with their similarity scores
sentences_with_scores = list(zip(sentences, similarities))

In [297]:
# Sort sentences by similarity score in descending order
sorted_sentences_with_scores = sorted(sentences_with_scores, key=lambda x: x[1], reverse=True)

In [298]:
for sentence, score in sorted_sentences_with_scores:
    print(f"Score: {score:.4f}, Sentence: {sentence}")
    print("#"*20)

Score: 0.8419, Sentence: Preinstalled Software Products are licensed for use only on the Lenovo hardware product on which 
they are preinstalled or with which they are included and may be transferred only with that Lenovo  Preinstalled Software Products are licensed for use only on the Lenovo hardware product on which 
they are preinstalled or with which they are included and may be transferred only with that Lenovo 
hardware product.
####################
Score: 0.7350, Sentence: Transferability    You may not transfer or assign the Software Product to any other party, except as permitted in this 
Section 6 “Transferability”.
####################
Score: 0.7172, Sentence: They may not be transferred independent of the Lenovo hardware product.
####################
Score: 0.7172, Sentence: They may not be transferred independent of the Lenovo hardware product.
####################
Score: 0.4949, Sentence: they are preinstalled or with which they are included and may be transferred only wi

In [299]:
# Find high-similarity sentences and their coordinates
high_similarity_sentences = []
for sentence, score in sorted_sentences_with_scores:
    if score >= COSINE_SIMILARITY_THRESHOLD:
        search_results = page.search_for(sentence)
        if search_results:  # Ensure search_results is not empty
            high_similarity_sentences.append(fitz.Rect(*search_results[0]))

In [300]:
high_similarity_sentences

[Rect(108.0199966430664, 203.02996826171875, 172.56199645996094, 215.4229736328125),
 Rect(219.0529327392578, 278.7350158691406, 516.3160400390625, 291.10101318359375),
 Rect(219.0529327392578, 278.7350158691406, 516.3160400390625, 291.10101318359375)]

In [301]:
# Highlight the entire chunk in yellow
for coord in coordinates:
    rect = fitz.Rect(*coord)
    page.add_highlight_annot(rect)

In [302]:
# Highlight high-similarity sentences in green
for rect in high_similarity_sentences:
    highlight = page.add_highlight_annot(rect)
    highlight.set_colors(stroke=(0, 1, 0))  # Green color
    highlight.update()

In [303]:
doc.save(highlighted_pdf_path, garbage=4)
doc.close()