In [1]:
import os
import shutil
import json
import requests
import fitz
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from chromadb.config import Settings
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.llms import HuggingFaceHub

  from tqdm.autonotebook import tqdm, trange
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


# Environment setup

In [2]:
def setup_environment():
    import sys
    sys.path.append('C:\\gitworkspace\\aimldemo\\jupyterworkapce')
    import stratup_env_setup
    stratup_env_setup.set_env()

In [3]:
setup_environment()

In [4]:
class HuggingFaceEmbeddings:
    """Custom wrapper for Hugging Face SentenceTransformer embeddings."""
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embedding_model = SentenceTransformer(model_name)
    
    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.embedding_model.encode(texts, convert_to_tensor=False).tolist()
    
    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.embedding_model.encode(text, convert_to_tensor=False).tolist()

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
chunk_size=500
chunk_overlap=50

In [7]:
#pdf_path = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf"
pdf_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs"
output_folder = "C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_outputs"

In [8]:
highlighted_pdf_path = os.path.join(output_folder, "highlighted_output.pdf")

In [9]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
#llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7)
#llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.7, "max_length": 512})


In [10]:
CREDIT_REVIEW_API_URL = "http://127.0.0.1:8000/getCreditReviewData"

# Vectorize PDF Document

In [11]:
db_path = "chroma_db"

In [12]:
if os.path.exists(db_path):
        print(f"[INFO] Deleting existing Chroma database at {db_path} to avoid conflicts.")
        shutil.rmtree(db_path, ignore_errors=True)

In [13]:
embeddings = HuggingFaceEmbeddings()

In [14]:
chroma_settings = Settings(persist_directory=db_path, anonymized_telemetry=False)

In [15]:
vector_db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=chroma_settings)

  vector_db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=chroma_settings)


In [16]:
def extract_chunks_with_coordinates(pdf_path):
    """Extract chunks from the PDF along with their coordinates (bounding boxes) and file name."""
    doc = fitz.open(pdf_path)
    chunks_with_metadata = []

    # Extract the file name from the full path
    file_name = os.path.basename(pdf_path)

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_text = page.get_text("blocks")
        full_text = " ".join(block[4] for block in page_text)

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        text_chunks = splitter.split_text(full_text)

        for chunk in text_chunks:
            coordinates = []
            search_results = page.search_for(chunk)
            for rect in search_results:
                coordinates.append((rect.x0, rect.y0, rect.x1, rect.y1))

            chunks_with_metadata.append({
                "text": chunk,
                "page": page_num,
                "coordinates": coordinates,
                "file_name": file_name  # Add file name to metadata
            })

    return chunks_with_metadata


In [17]:
pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

In [18]:
pdf_files

['C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\Form_2287.pdf',
 'C:\\gitworkspace\\aimldemo\\AIML Demo\\Python Demo\\pythonbasics\\doc_inputs\\lenovo_license_agreement.pdf']

In [19]:
text_chunks_with_metadata = []
for pdf_file in pdf_files:
    chunks = extract_chunks_with_coordinates(pdf_file)
    text_chunks_with_metadata.extend(chunks)

In [20]:
text_chunks_with_metadata[0]

{'text': 'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is',
 'page': 0,
 'coordinates': [(74.28336334228516,
   736.0769653320312,
   161.34852600097656,
   747.1239013671875),
  (287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875),
  (395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875),
  (72.0, 745.435546875, 172.51608276367188, 756.4824829101562),
  (141.0, 82.16398620605469, 144.0, 98.17198944091797),
  (177.0, 82.16398620605469, 180.0, 98.17198944091797),
  (138.72000122070312,
   107.96397399902344,
   141.72000122070312,
   

In [21]:
page_number = 2
file_name = 'Form_2287.pdf'

In [22]:
for chunk in text_chunks_with_metadata:
    if chunk['file_name'] == file_name and chunk['page'] == page_number:
        print("Text Chunk:", chunk['text'])
        print("Page Number:", chunk['page'])
        print("File Name:", chunk['file_name'])
        print("#" * 50)

Text Chunk: SBA Form 2287 (04-18) 
 
          3 
 Previous Editions Obsolete 
 failure to pay taxes when due or violation of  any financial covenants which would cause a prudent 
lender to believe that the  prospect of payment or performance of the Third Party Note is impaired. 
 
 
c. 
No Cross-Collateralization.   Third Party Lender agrees that the Common Collateral 
 will only secure its Third Party Loan and the Common Collateral is not currently, and will not be
Page Number: 2
File Name: Form_2287.pdf
##################################################
Text Chunk: used in the future, as security for any other financing provided by Third Party Lender to Borrower 
that purports to be in a superior position to that of the CDC Lien, unless authorized in writing by 
CDC and SBA. 
 
 
d. 
No Cross-Default.   During the term of the 504 Loan, Third Party Lender will not 
 exercise any cross-default, "deem at-risk," or any other provisions in documents evidencing the Third
Page Number: 2
Fi

In [23]:
chunk_texts = [item["text"] for item in text_chunks_with_metadata]
metadata = [{"page_number": item["page"], "file_name": item["file_name"], "coordinates": json.dumps(item["coordinates"])} for item in text_chunks_with_metadata]

In [24]:
chunk_texts[0]

'SBA Form 2287 (04-18) \n \n          1 \n Previous Editions Obsolete \n  \n \n  \n  \n  \n THIRD PARTY LENDER AGREEMENT \n  \n \n \nTHIS THIRD PARTY LENDER AGREEMENT (“Agreement") is dated this ______ day \nof______________, 20____, by and between ________________________________________, \n(“Third Party Lender”) whose address is ____________________________________________, \nand________________________________________________________, (“CDC”) whose address is'

In [25]:
metadata[0]

{'page_number': 0,
 'file_name': 'Form_2287.pdf',
 'coordinates': '[[74.28336334228516, 736.0769653320312, 161.34852600097656, 747.1239013671875], [287.99462890625, 736.0769653320312, 290.229736328125, 747.1239013671875], [395.9959411621094, 736.0769653320312, 424.5137939453125, 747.1239013671875], [72.0, 745.435546875, 172.51608276367188, 756.4824829101562], [141.0, 82.16398620605469, 144.0, 98.17198944091797], [177.0, 82.16398620605469, 180.0, 98.17198944091797], [138.72000122070312, 107.96397399902344, 141.72000122070312, 123.97197723388672], [72.0, 133.7639617919922, 75.0, 149.77195739746094], [306.0, 159.56394958496094, 309.0, 175.5719451904297], [155.0399932861328, 184.8782196044922, 168.40667724609375, 212.6336212158203], [168.36000061035156, 189.1377410888672, 214.11094665527344, 211.24234008789062], [214.0800018310547, 184.8782196044922, 226.32444763183594, 212.6336212158203], [226.32000732421875, 189.1377410888672, 275.67633056640625, 211.24234008789062], [275.760009765625, 1

In [26]:
vector_db.add_texts(chunk_texts, metadatas=metadata)

['f18b0687-9403-47b7-a0dc-d01048fa10ff',
 '1285a9b6-690d-4445-892b-9a3c068b9684',
 '34ddb139-9fcb-4670-a21c-2bea78d482b7',
 '53d6b17f-2ff3-409e-936d-1bfabadd9e9a',
 '09dd9947-3faf-47f6-80eb-029e71305133',
 '836b8faf-12b1-4a83-85e3-61a7a84b3b9a',
 'd0cce04e-9f4e-405a-9b1a-57263bc11ebb',
 'c2537c22-0a89-46fe-8bee-b3d11b9f746c',
 '4a39e2a4-6f55-4fee-842e-1be42bbc1558',
 '77f4875a-73b6-4987-9565-ccf2c6e4a46e',
 '02474009-d6e1-4d5f-b06f-36dce85148de',
 '616f1210-7f42-46af-932c-74596f10a0f0',
 '299459ad-40cd-43c7-9b02-4ca5499726b6',
 'db52ca2d-578d-4934-b30c-3cc2700f51df',
 '8968c79c-cf4c-46ed-a3be-d9d3f3585c06',
 '55a49efc-4ecf-45f4-9bc7-2c03001c5393',
 'aed0f69a-be05-4805-ac5a-aefd726f35a9',
 '3e00b9a2-ff06-48c2-8850-eeecb7fed0af',
 '1533c79b-5646-4cca-87b8-e00425aa2b07',
 '1ce71e11-1151-4716-b622-b5402361a9d8',
 '4cb51f8c-abfb-471d-b312-ecf185d98f11',
 'c4500e49-57b3-49b3-b30d-acd42e760567',
 'b6259c1d-e54b-45f8-a4f3-3a18bb286cc1',
 '14892ba5-5af7-4d44-b3db-339f63945eed',
 '347eee04-7348-

In [27]:
vector_db.persist()

  vector_db.persist()


# Chat Query

In [28]:
#user_query = 'What does this document say about Balloon payments?'
#user_query = "what does this document say about authority to execute agreement?"
#user_query = 'About Balloon payments in page_number 3 of Form_2287.pdf?'
user_query = 'What does the docuemnt say about Transferability in page_number 3 of lenovo_license_agreement.pdf?'
#user_query = 'How many credit reviews are there for cagid 123?'

In [29]:
chat_history = [
    {"role": "user", "content": "CDC agrees to make the 504 Loan to the Borrower subject to what?"},
    {"role": "assistant", "content": "SBA's approval."},
    {"role": "user", "content": "Third Party Lender waives its rights for what?"},
    {"role": "assistant", "content": "Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements."},
    {"role": "user", "content": "what does this document say about authority to execute agreement?"},
    {"role": "assistant", "content": "The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party."}
]

* Rephrase the query to identify teh user's intended full question using LLM

In [30]:
# Template to rephrase the user query and extract metadata
template = """
Given the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. Then, extract metadata (only "file_name" and "page_number" fields) if they are mentioned in the question or chat history.
Rewritten query shoudl not have refernce to metadata fields file_name and page_number.

Chat History:
{chat_history}

Current user query: {query}

Rewritten query:
[Provide the rewritten query here.]

Extracted Metadata (JSON format with only file_name and page_number fields):
[Provide the metadata here as a valid JSON object.]
"""

In [31]:
# Format the chat history into a string
formatted_chat_history = "\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history])

In [32]:
# Create the prompt for the LLM
prompt = PromptTemplate.from_template(template).format(
    chat_history=formatted_chat_history,
    query=user_query
)

In [33]:
prompt 

'\nGiven the chat history and the current user query, rewrite the query so it forms a complete question, maintaining the context of the conversation. Then, extract metadata (only "file_name" and "page_number" fields) if they are mentioned in the question or chat history.\nRewritten query shoudl not have refernce to metadata fields file_name and page_number.\n\nChat History:\nUser: CDC agrees to make the 504 Loan to the Borrower subject to what?\nAssistant: SBA\'s approval.\nUser: Third Party Lender waives its rights for what?\nAssistant: Third Party Lender waives its rights to enforce provisions that do not comply with the 504 Loan Program Requirements.\nUser: what does this document say about authority to execute agreement?\nAssistant: The persons signing below certify that they have been duly authorized to execute this Agreement on behalf of their respective party.\n\nCurrent user query: What does the docuemnt say about Transferability in page_number 3 of lenovo_license_agreement.pdf

In [34]:
response = llm.invoke(prompt)

In [37]:
response

AIMessage(content='Rewritten query: What does the document say about Transferability in the Lenovo License Agreement?\n\nExtracted Metadata:\n{\n  "file_name": "lenovo_license_agreement.pdf",\n  "page_number": 3\n}', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 46, 'prompt_tokens': 250, 'total_tokens': 296, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-70ef3a3b-6894-48b9-9b9c-a99217c43877-0', usage_metadata={'input_tokens': 250, 'output_tokens': 46, 'total_tokens': 296, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [38]:
response_content = response.content  

In [41]:
# Split the response into rewritten query and metadata parts
response_parts = response_content.split("\nExtracted Metadata:")

In [42]:
# Print the extracted parts
print("Rewritten Query:", response_parts[0].strip())
if len(response_parts) > 1:
    print("Extracted Metadata:", response_parts[1].strip())

Rewritten Query: Rewritten query: What does the document say about Transferability in the Lenovo License Agreement?
Extracted Metadata: {
  "file_name": "lenovo_license_agreement.pdf",
  "page_number": 3
}


In [48]:
query = response_parts[0].replace("Rewritten query:", "").strip()

In [44]:
metadata_str = response_parts[1].strip()

In [45]:
metadata = json.loads(metadata_str)

In [46]:
metadata

{'file_name': 'lenovo_license_agreement.pdf', 'page_number': 3}

In [47]:
query

'What does the document say about Transferability in the Lenovo License Agreement?'

# Agent for Rest Services

In [None]:
prompt_template = PromptTemplate(
    input_variables=["query"],
    template="""
    Analyze the following user query and determine whether to query the credit review REST API or search the vector database.

    If the query is related to credit review (e.g., approvals, CAGID, or review type), determine:
    - Mandatory parameters: CAGID
    - Optional filters: creationDate, creditReviewType, approvedBy, tfa
    - Which specific field the user wants (e.g., "approvedBy").

    Return a JSON object with the following fields:
    - "invoke_rest": true/false
    - "params": {{"CAGID": "<value>", "creationDate": "<value>", "creditReviewType": "<value>", "approvedBy": "<value>", "tfa": "<value>"}}
    - "required_field": the specific field the user needs, if applicable.

    User query: "{query}"
    """
)

In [None]:
formatted_prompt = prompt_template.format(query=user_query)

In [None]:
response = llm.predict(formatted_prompt)

In [None]:
response

In [None]:
import re

In [None]:
match = re.search(r'\{.*\}', response, re.DOTALL)
if match:
    json_str = match.group(0)
    try:
        response_dict = json.loads(json_str)
        print("Parsed JSON successfully:", response_dict)
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
else:
    print("No JSON object found in response.")

In [None]:
if response_dict.get("invoke_rest"):
    print("Calling REST API with parameters:", response_dict["params"])
else:
    print("No REST API call needed.")

# If Condition to call REST Endpoint

In [None]:
api_params = {
    "cagid": response_dict["params"].get("CAGID")  # Convert key to lowercase
}

In [None]:
api_params

In [None]:
response = requests.get(CREDIT_REVIEW_API_URL, params=api_params)

In [None]:
response

In [None]:
if response.status_code == 200:
    api_response = response.json()
    print("API Response Received Successfully:")
    print(json.dumps(api_response, indent=4))
else:
    print(f"API call failed with status code: {response.status_code}")
    print(response.text)

In [None]:
prompt = f"""
You are an intelligent assistant. I have the following JSON data retrieved from a REST API:

JSON Structure:
- Each element in the JSON array represents a credit review record.
- Fields:
    - `cagid`: The unique identifier for the customer.
    - `creationDate`: The date the credit review was created.
    - `creditReviewType`: The type of credit review (e.g., ANNUAL_REVIEW, INTERIM_REVIEW, CREDIT_MEMO).
    - `camId`: The unique identifier for the credit application memo.
    - `approvedBy`: The SOEID of the approver.
    - `tfa`: The total facility amount for the credit review.

JSON Data:
{json.dumps(api_response, indent=4)}

Using the JSON data above, answer the following query in natural language:
{user_query}
"""

In [None]:
response = llm.predict(prompt)

In [None]:
response

# Fetch relevant chunks from Vector DB- (Else condition)

In [None]:
 metadata_filter = {
        "$and": [
            {key: {"$eq": (value - 1 if key == "page_number" else value)}}
            for key, value in metadata.items()
        ]
    }

In [None]:
metadata_filter

In [None]:
top_chunks_with_scores = vector_db.similarity_search_with_score(query, k=3)
top_chunks_with_scores = vector_db.similarity_search_with_score(
    query=query,
    k=3,
    filter=metadata_filter
)

In [None]:
top_chunks_with_scores

In [None]:
# Extract chunk texts for cross-encoding
top_chunks = [chunk.page_content for chunk, _ in top_chunks_with_scores]

In [None]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
query_chunk_pairs = [(query, chunk) for chunk in top_chunks]

In [None]:
query_chunk_pairs

In [None]:
cross_encoded_scores = cross_encoder.predict(query_chunk_pairs)

In [None]:
# Pair chunks with their cross-encoder scores and sort them in descending order
ranked_chunks_with_scores = sorted(
    zip(top_chunks_with_scores, cross_encoded_scores),
    key=lambda x: x[1],  # Sort by score (descending)
    reverse=True
)

In [None]:
top_chunk_cross_encoded, top_cross_encoded_score = ranked_chunks_with_scores[0][0]

In [None]:
metadata_of_top_chunk = top_chunk_cross_encoded.metadata

In [None]:
metadata_of_top_chunk

In [None]:
top_chunk_cross_encoded

In [None]:
metadata_of_top_chunk

# LLM query

In [None]:
llm_instruction = "Please answer the question by carefully considering both the provided context and the chat history. Use the context for accurate information and take into account any relevant details from the chat history to generate a well-informed response. If enough information is not available, please respond by saying: 'There is not enough information to answer the question.'"

In [None]:
# Create the prompt
template = """
Context: 
{context}

Chat history:
{chat_history}

Instruction: 
{llm_instruction}

User question: {question}
"""

In [None]:
formatted_prompt = PromptTemplate.from_template(template).format(
    context=top_chunk_cross_encoded.page_content,
    chat_history="\n".join([f'{entry["role"].capitalize()}: {entry["content"]}' for entry in chat_history]),
    question=query,
    llm_instruction=llm_instruction
)

In [None]:
formatted_prompt

In [None]:
# Get response from LLM
llm_response = llm.predict(formatted_prompt)

In [None]:
print("#"*50)
print(llm_response)

# PDF Highlighting

In [None]:
# Configurable cosine similarity threshold
COSINE_SIMILARITY_THRESHOLD = 0.5

In [None]:
page_number =  metadata_of_top_chunk["page_number"]

In [None]:
page_number

In [None]:
coordinates_str = metadata_of_top_chunk["coordinates"]

In [None]:
coordinates_str

In [None]:
coordinates = json.loads(coordinates_str)

In [None]:
coordinates

In [None]:
file_for_highlighting = os.path.join(pdf_folder, metadata_of_top_chunk['file_name'])

In [None]:
file_for_highlighting

In [None]:
highlighted_pdf_path

In [None]:
doc = fitz.open(file_for_highlighting)

In [None]:
page = doc.load_page(page_num)

In [None]:
type(page) 

In [None]:
page.get_text()

In [None]:
chunk_text = " ".join(page.get_textbox(fitz.Rect(*coord)) for coord in coordinates)

In [None]:
chunk_text

In [None]:
sentences = nltk.sent_tokenize(chunk_text)

In [None]:
# Initialize Sentence Transformer for cosine similarity
similarity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Compute embeddings for the LLM response and sentences
llm_response_embedding = similarity_model.encode(llm_response, convert_to_tensor=True)
sentence_embeddings = similarity_model.encode(sentences, convert_to_tensor=True)

In [None]:
# Compute cosine similarities
similarities = util.cos_sim(llm_response_embedding, sentence_embeddings)[0].tolist()

In [None]:
# Pair sentences with their similarity scores
sentences_with_scores = list(zip(sentences, similarities))

In [None]:
# Sort sentences by similarity score in descending order
sorted_sentences_with_scores = sorted(sentences_with_scores, key=lambda x: x[1], reverse=True)

In [None]:
for sentence, score in sorted_sentences_with_scores:
    print(f"Score: {score:.4f}, Sentence: {sentence}")
    print("#"*20)

In [None]:
# Find high-similarity sentences and their coordinates
high_similarity_sentences = []
for sentence, score in sorted_sentences_with_scores:
    if score >= COSINE_SIMILARITY_THRESHOLD:
        search_results = page.search_for(sentence)
        if search_results:  # Ensure search_results is not empty
            high_similarity_sentences.append(fitz.Rect(*search_results[0]))

In [None]:
high_similarity_sentences

In [None]:
# Highlight the entire chunk in yellow
for coord in coordinates:
    rect = fitz.Rect(*coord)
    page.add_highlight_annot(rect)

In [None]:
# Highlight high-similarity sentences in green
for rect in high_similarity_sentences:
    highlight = page.add_highlight_annot(rect)
    highlight.set_colors(stroke=(0, 1, 0))  # Green color
    highlight.update()

In [None]:
doc.save(highlighted_pdf_path, garbage=4)
doc.close()