In [1]:
import PyPDF2
import re
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Optional, List
from datetime import datetime
import torch
import ollama
import spacy
from sentence_transformers import SentenceTransformer, util




In [2]:
CHUNK_SIZE = 300
CYAN = '\033[96m'
NEON_GREEN = '\033[92m'
RESET_COLOR = '\033[0m'

client = OpenAI(
    base_url='http://localhost:11434/v1',
    api_key='llama3.2:3b'
)

class CreditAgreementMetadata(BaseModel):
    issuer: Optional[str] = None
    administrative_agent: Optional[str] = None
    underwriter: Optional[str] = None
    agreement_date: Optional[str] = Field(None, description="Agreement date in YYYY-MM-DD format")

class CovenantDetail(BaseModel):
    title: str
    description: str

class Covenant(BaseModel):
    items: List[CovenantDetail]

def prompts_and_execution(content, isMetaData):
    if isMetaData:
        prompt = f"""
            ### System Instruction:  
            You are an AI model tasked with extracting specific financial details from documents. **Follow these instructions precisely:**  
            - Return the output in **strict JSON format** with the specified keys.  
            - If a key's information is missing, return an **empty string ("")** instead of omitting the key.  
            - Do **not** add extra text, explanations, or additional keys
            - **Don't need your <think> tags in the answer**
            
            ### **Extraction Task:**  
            Extract the following details from the provided document:  
            
            - **issuer**: The entity issuing the agreement.  
            - **administrative_agent**: The administrative agent managing the agreement.  
            - **underwriter**: The underwriter, bookrunner, or lead arranger.  
            - **agreement_date**: The agreement date in **YYYY-MM-DD** format.  
            
            ### **Document Content:**  
            {content}
            """
    else:
        prompt = f"""
            ### System Instruction:  
            You are an helpful AI assistant tasked with extracting specific financial details from documents. **Follow these instructions precisely:**  
            - Return the output in **strict JSON format** with the **specified keys** as mentioned in extraction template 
            - No Extra information is needed 
            - no triple single quotes needed in output to represent json output 
            - add to list only if you have data for title and description. Don't add if you have empty description from the document
            
            ### **Extraction Task:**  
            Extract the list of covenants, terms & conditions and details from the provided document with the list of covenant type as key named 'title' and its summary/description of the respective covenant type as value named 'description' in around 30 words.
            
            [
                {{
                    "title": "<covenant type 1>" **In 2-5 word**
                    "description": "<description of covenant type 1>" **In 30 words**
                }},
                {{
                    "title": "<covenant type 2>" **In 2-5 words**
                    "description": "<description of covenant type 2>" **In 30 words**
                }},
                {{
                    "title": "<covenant type 3>" **In 2-5 words**
                    "description": "<description of covenant type 3>" **In 30 words**
                }},
                ... **TO BE CONTINUED**
            ]

            ### **Document Content:**  
            {content}
            """
    # Send the structured prompt to Ollama
    response = client.chat.completions.create(
        model="llama3.2:3b",  # Ensure this matches your model name
        messages=[{"role": "user", "content": prompt}]
    )
    
    # Extract the text response from the model
    response_text = response.choices[0].message.content
    return response_text

In [3]:
def is_legal_structure(text):
    """Check if text matches the legal-style structure of Articles and Sections."""
    pattern = r"(Article\s+[IVXLCDM]+\s+.*\d+|Section\s+\d+\.\d+\s+.*\d+)"
    return re.search(pattern, text) is not None
    
def clean_text(text):
    # Remove URLs and SEC archive references
    text = re.sub(r'https?://\S+|sec\.gov/Archives/\S+', '', text)
    # Remove metadata like EX-10.1 and timestamps
    text = re.sub(r'EX-\d+\.\d+.*', '', text)
    text = re.sub(r'\d+/\d+/\d+, \d+:\d+ \w+', '', text)
    # Remove trailing page numbers (e.g., 1/274)
    text = re.sub(r'\b\d+/\d+\b', '', text)

    if is_legal_structure(text):
        return ""

    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def getDataFromPdf(pdf_path):
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    text_list = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        text_list.append(clean_text(page_text))
    return " ".join(text_list)  # Join all pages with a space

def getMetaDataFromPdf(pdf_path):
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    text = pdf_reader.pages[0].extract_text() if pdf_reader.pages else None
    return clean_text(text)

In [4]:
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient semantic embeddings

def prepareChunks(text, chunk_size=CHUNK_SIZE, output_file="vault.txt", similarity_threshold=0.6):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Use spaCy for sentence segmentation
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    chunks = []
    current_chunk = []
    current_chunk_embedding = None  # Store embeddings for semantic similarity checks

    for sentence in sentences:
        sentence_embedding = embedder.encode(sentence, convert_to_tensor=True)

        # If adding a sentence exceeds chunk size OR it's semantically different → Start a new chunk
        if (sum(len(s) for s in current_chunk) + len(sentence) + 1 > chunk_size or
                (current_chunk_embedding is not None and 
                 util.pytorch_cos_sim(current_chunk_embedding, sentence_embedding).item() < similarity_threshold)):
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            current_chunk.append(sentence)
            # Update chunk embedding as average of existing embeddings
            current_chunk_embedding = sentence_embedding if current_chunk_embedding is None else \
                                      (current_chunk_embedding + sentence_embedding) / 2

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # Save chunks to a file
    with open(output_file, "w", encoding="utf-8") as vault_file:
        vault_file.write("\n\n".join(chunks))  # Efficient file writing

    print(f"PDF content stored in {output_file} with semantic chunking.")
    
    return chunks

In [5]:
def prepareEmbeddings(chunks):
    print(NEON_GREEN + "Generating embeddings for the vault content..." + RESET_COLOR)
    vault_embeddings = []
    for chunk in chunks:
        try:
            response = ollama.embeddings(model='nomic-embed-text', prompt=chunk)
            embedding = response.get("embedding")
            if embedding:  # Ensure embedding is not None
                vault_embeddings.append(embedding)
            else:
                print(f"Skipping invalid embedding for content: {chunk.strip()}")
        except Exception as e:
            print(f"Failed to generate embedding for content: {chunk.strip()}. Error: {e}")

    if not vault_embeddings:
        print("No valid embeddings generated. Exiting...")
        exit(1)
    return vault_embeddings

In [6]:
def convertEmbeddingsToTensor(vault_embeddings):
    # Ensure all embeddings have the same size
    embedding_size = len(vault_embeddings[0])
    if any(len(e) != embedding_size for e in vault_embeddings):
        print("Embedding size mismatch detected. Skipping invalid embeddings...")
        vault_embeddings = [e for e in vault_embeddings if len(e) == embedding_size]

    # Convert to tensor
    vault_embeddings_tensor = torch.tensor(vault_embeddings)
    print("Embeddings for each line in the vault:")
    print(vault_embeddings_tensor)
    return vault_embeddings_tensor

In [7]:
def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k=15):
    if not rewritten_input.strip():
        print("Rewritten input is empty. Skipping context retrieval.")
        return []

    if vault_embeddings.nelement() == 0:  # Check if the tensor has any elements
        print("Vault embeddings are empty. Skipping context retrieval.")
        return []

    try:
        input_embedding = ollama.embeddings(model='nomic-embed-text', prompt=rewritten_input)["embedding"]
    except Exception as e:
        print(f"Failed to generate input embedding. Error: {e}")
        return []

    if not input_embedding:
        print("Input embedding is invalid. Skipping context retrieval.")
        return []

    # Compute cosine similarity
    cos_scores = torch.cosine_similarity(torch.tensor(input_embedding).unsqueeze(0), vault_embeddings)

    # Adjust top_k if needed
    top_k = min(top_k, len(cos_scores))
    top_indices = torch.topk(cos_scores, k=top_k)[1].tolist()

    # Retrieve relevant context
    relevant_context = [vault_content[idx].strip() for idx in top_indices]
    return relevant_context

In [8]:
def getRelevantEmbeddings(query, embeddings, content):
    relevant_context = get_relevant_context(query, embeddings, content)
    if relevant_context:
        context_str = "\n".join(relevant_context)
        print("Context Pulled from Documents: \n\n" + CYAN + context_str + RESET_COLOR)
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)

    return prompts_and_execution(context_str, False)

In [9]:
text_data = getDataFromPdf("./Example_2.pdf")

In [10]:
first_page_content = getMetaDataFromPdf("./Example_2.pdf")
json_response = prompts_and_execution(first_page_content, True)
json_response = re.sub(r'<think>.*?</think>','',json_response)

def parse_json_response(response_text):
    """
    Parses the JSON response and maps it to the CreditAgreementDetails model.
    """
    data = json.loads(response_text)  # Convert JSON string to Python dictionary
    date_obj = None
    # Normalize keys to match the Pydantic model
    mapped_details = {
        "issuer": data.get("issuer"),
        "administrative_agent": data.get("administrative_agent"),
        "underwriter": data.get("underwriter"),
        "agreement_date": data.get("agreement_date")
    }
    return CreditAgreementMetadata(**mapped_details)

parse_json_response(json_response)

CreditAgreementMetadata(issuer='JPMORGAN CHASE BANK, N.A.', administrative_agent='JPMORGAN CHASE BANK, N.A.', underwriter='JPMORGAN CHASE BANK, N.A.', agreement_date='2023-07-26')

In [11]:
json_response = re.sub(r'<think>.*</think>','',json_response)
print(json_response)
def parse_json_response(response_text):
    """
    Parses the JSON response and maps it to the CreditAgreementDetails model.
    """
    data = json.loads(response_text)  # Convert JSON string to Python dictionary
    date_obj = None
    # Normalize keys to match the Pydantic model
    mapped_details = {
        "issuer": data.get("issuer"),
        "administrative_agent": data.get("administrative_agent"),
        "underwriter": data.get("underwriter"),
        "agreement_date": data.get("agreement_date")
    }
    return CreditAgreementMetadata(**mapped_details)

parse_json_response(json_response)

{"issuer": "JPMORGAN CHASE BANK, N.A.", "administrative_agent": "JPMORGAN CHASE BANK, N.A.", "underwriter": "JPMORGAN CHASE BANK, N.A.", "agreement_date": "2023-07-26"}


CreditAgreementMetadata(issuer='JPMORGAN CHASE BANK, N.A.', administrative_agent='JPMORGAN CHASE BANK, N.A.', underwriter='JPMORGAN CHASE BANK, N.A.', agreement_date='2023-07-26')

In [12]:
chunks = prepareChunks(text_data)

PDF content stored in vault.txt with semantic chunking.


In [13]:
embeddings = prepareEmbeddings(chunks)

[92mGenerating embeddings for the vault content...[0m
Skipping invalid embedding for content: 


In [14]:
tensor_embedding = convertEmbeddingsToTensor(embeddings)

Embeddings for each line in the vault:
tensor([[ 0.5527,  0.6475, -3.3231,  ..., -1.2179, -0.5984, -0.2288],
        [ 0.1551,  1.2223, -3.8140,  ..., -0.8028, -0.9273, -1.1324],
        [ 0.0694, -0.3534, -3.4456,  ..., -0.6191, -0.5641, -1.1630],
        ...,
        [-0.1832,  0.2234, -3.0310,  ..., -1.5724, -1.4899, -0.8560],
        [ 0.8140,  1.1958, -3.4786,  ..., -1.3802, -0.8361, -0.2035],
        [ 1.0608,  0.4400, -3.2910,  ..., -1.8609, -0.9382,  0.1237]])


In [15]:
def get(query):
    return getRelevantEmbeddings(query, tensor_embedding, chunks)

In [16]:
ques='Extract all the covenant details, its terms and agreements defined in the document'
response = get(ques)

Context Pulled from Documents: 

[96m“Final Release Conditions” has the meaning assigned to such term in Section 9.14(c).
Each Borrowing (other than a conversion or continuation of any Loans) shall be deemed to constitute a representation and warranty by the Borrower on the date thereof as to the matters specified in paragraphs (a) and (b) of this Section.
“Collateral” means any and all property owned, leased or operated by a Person covered by the Collateral Documents and any and all other property of any Loan Party, now existing or hereafter acquired, that may at any time be or become subject to a security interest or Lien in favor of the Administrative Agent, on behalf of itself and the Secured Parties, pursuant to the Collateral Documents to secure the Secured Obligations; provided that the Collateral shall exclude Excluded Assets.
Section 9.05 Survival.
Notwithstanding anything herein to the contrary, no Intellectual Property that is owned by or licensed to the Borrower or its Sub

In [17]:
response

'[\n    {\n        "title": "Covenant Description",\n        "description": "Matters specified in paragraphs (a) and (b) of this Section, as a representation and warranty by the Borrower"\n    },\n    {\n        "title": "Collateral Exclusions",\n        "description": "Ex excluded Assets, pursuant to the Collateral Documents to secure the Secured Obligations are excluded from the definition of \\"Collateral\\"."\n    }\n]'

In [18]:
response = response.replace("'''", "")
try:
    json_data = json.loads(response)  # Convert text to JSON
    parsed_response = Covenant(items=json_data)  # Validate with Pydantic
    print(parsed_response)
except json.JSONDecodeError:
    print("Error: Model did not return valid JSON. Raw output:")
    print(response)
except Exception as e:
    print(f"Failed to parse response: {e}")

items=[CovenantDetail(title='Covenant Description', description='Matters specified in paragraphs (a) and (b) of this Section, as a representation and warranty by the Borrower'), CovenantDetail(title='Collateral Exclusions', description='Ex excluded Assets, pursuant to the Collateral Documents to secure the Secured Obligations are excluded from the definition of "Collateral".')]
