In [1]:
import PyPDF2
import re
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Optional
from datetime import datetime

In [2]:
# OpenAI-compatible client pointed at a server on this machine (port 11434).
# NOTE(review): the api_key is the same string as the model name used in the
# chat request below — presumably a placeholder; confirm the local server
# does not actually validate it.
client = OpenAI(api_key='llama3.2:3b', base_url='http://localhost:11434/v1')

class CreditAgreementDetails(BaseModel):
    """Key parties and date pulled from a credit agreement.

    Every field is optional and defaults to ``None``. Field order is kept
    stable because it determines the order shown in the model's repr.
    """

    # Entity named as issuer of the agreement.
    issuer: Optional[str] = Field(default=None)
    # Entity named as administrative agent.
    administrative_agent: Optional[str] = Field(default=None)
    # Underwriter of the agreement.
    underwriter: Optional[str] = Field(default=None)
    # Stored as plain text; the YYYY-MM-DD format is only described, not enforced.
    agreement_date: Optional[str] = Field(
        default=None,
        description="Agreement date in YYYY-MM-DD format",
    )

In [3]:
def clean_text(text):
    """Strip filing boilerplate from extracted PDF text and collapse whitespace.

    Args:
        text: Raw text to clean. ``None`` or an empty string is accepted and
            yields an empty string instead of raising.

    Returns:
        The cleaned text on a single line, with leading and trailing
        whitespace removed.
    """
    # Nothing to clean (e.g. the caller found no page to extract).
    if not text:
        return ""

    # Applied strictly in this order: the timestamp pattern must run before
    # the generic "digits/digits" pattern, which would otherwise consume part
    # of a date such as 7/26/23 and leave the rest behind.
    substitutions = (
        # Remove URLs and SEC archive references
        (r'https?://\S+|sec\.gov/Archives/\S+', ''),
        # Remove metadata like EX-10.1 (through the end of that line)
        (r'EX-\d+\.\d+.*', ''),
        # Remove timestamps such as "7/26/23, 10:15 AM"
        (r'\d+/\d+/\d+, \d+:\d+ \w+', ''),
        # Remove trailing page numbers (e.g., 1/274)
        (r'\b\d+/\d+\b', ''),
        # Collapse runs of spaces and newlines into a single space
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
    
def getDataFromPdf(pdf_path):
    """Return the cleaned text of the first page of a PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The first page's text after ``clean_text``; an empty string when the
        PDF has no pages or the page yields no text.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    # A page-less document previously produced None, which crashed clean_text.
    if not pdf_reader.pages:
        return ""
    # Guard against an empty/None extraction result as well.
    text = pdf_reader.pages[0].extract_text() or ""
    return clean_text(text)

# Cleaned first-page text of the sample agreement; the path is relative to
# the notebook's working directory.
first_page_content = getDataFromPdf("./Example_2.pdf")

In [7]:
# # Define the structured prompt
# prompt = f"""
# Extract the following details from the given document. Respond in JSON format using only the keys below, and return an empty string ("") as the value for any key the document gives no information about:

# issuer: (Who is the issuer of the agreement?)
# administrative_agent: (Who is the administrative agent?)
# underwriter: (Who is the underwriter or bookrunner or lead arranger?)
# agreement_date: (What is the agreement date? in the format of  YYYY-MM-DD)

# Document:
# {first_page_content}
# """

# Extraction prompt: fixed instructions followed by the cleaned first-page
# text. The f-string is evaluated here, so first_page_content must already
# be defined when this cell runs.
prompt = f"""
### System Instruction:  
You are an AI model tasked with extracting specific financial details from documents. **Follow these instructions precisely:**  
- Return the output in **strict JSON format** with the specified keys.  
- If a key's information is missing, return an **empty string ("")** instead of omitting the key.  
- Do **not** add extra text, explanations, or additional keys.  

### **Extraction Task:**  
Extract the following details from the provided document:  

- **issuer**: The entity issuing the agreement.  
- **administrative_agent**: The administrative agent managing the agreement.  
- **underwriter**: The underwriter, bookrunner, or lead arranger.  
- **agreement_date**: The agreement date in **YYYY-MM-DD** format.  

### **Document Content:**  
{first_page_content}
"""

In [8]:
# Show the fully rendered prompt for inspection before it is sent.
print(prompt)


### System Instruction:  
You are an AI model tasked with extracting specific financial details from documents. **Follow these instructions precisely:**  
- Return the output in **strict JSON format** with the specified keys.  
- If a key's information is missing, return an **empty string ("")** instead of omitting the key.  
- Do **not** add extra text, explanations, or additional keys.  

### **Extraction Task:**  
Extract the following details from the provided document:  

- **issuer**: The entity issuing the agreement.  
- **administrative_agent**: The administrative agent managing the agreement.  
- **underwriter**: The underwriter, bookrunner, or lead arranger.  
- **agreement_date**: The agreement date in **YYYY-MM-DD** format.  

### **Document Content:**  
Exhibit 10.1 EXECUTION VERSION CREDIT AGREEMENT dated as of July 26, 2023 among HARMONY BIOSCIENCES HOLDINGS, INC., as Borrower The Lenders Party Hereto JPMORGAN CHASE BANK, N.A. as Administrative Agent and JPMORGAN CHASE 

In [9]:
# Send the structured prompt to Ollama
response = client.chat.completions.create(
    model="llama3.2:3b",  # Ensure this matches your model name
    messages=[{"role": "user", "content": prompt}],
    # The reply is fed directly to json.loads later, so ask the server for
    # JSON mode rather than relying on the prompt wording alone.
    response_format={"type": "json_object"},
    # Extraction should be repeatable across runs; disable sampling randomness.
    temperature=0,
)

# Extract the text response from the model
response_text = response.choices[0].message.content
print(response_text)

{"issuer": "HARMONY BIOSCIENCES HOLDINGS, INC.", "administrative_agent": "JPMORGAN CHASE BANK, N.A.", "underwriter": "JPMORGAN CHASE BANK, N.A.", "agreement_date": "2023-07-26"}


In [10]:
def parse_json_response(json_response):
    """
    Parse the model's JSON reply into a CreditAgreementDetails instance.

    The reply is decoded as-is first. If that fails and the reply is a string,
    the text between the first "{" and the last "}" is decoded instead, so a
    reply wrapped in a Markdown code fence or surrounded by commentary still
    parses.

    Args:
        json_response: JSON text returned by the model.

    Returns:
        CreditAgreementDetails built from the keys issuer,
        administrative_agent, underwriter and agreement_date. Keys absent
        from the reply become None; present values are passed through
        unchanged (an empty string stays an empty string).

    Raises:
        json.JSONDecodeError: If no decodable JSON is found in the reply.
        ValueError: If the decoded JSON is not an object.
    """
    try:
        data = json.loads(json_response)
    except json.JSONDecodeError:
        if not isinstance(json_response, str):
            raise
        # Fall back to the outermost {...} span to tolerate code fences or
        # stray prose around the JSON object.
        start = json_response.find("{")
        end = json_response.rfind("}")
        if start == -1 or end < start:
            raise
        data = json.loads(json_response[start:end + 1])

    # A JSON list/number/string has no .get(); fail with a clear message.
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(data).__name__}"
        )

    # Keep only the keys the Pydantic model knows about.
    expected_keys = ("issuer", "administrative_agent", "underwriter", "agreement_date")
    mapped_details = {key: data.get(key) for key in expected_keys}
    return CreditAgreementDetails(**mapped_details)

# Build the structured result from the model's reply; as the last expression
# in the cell, the resulting object is what the notebook displays.
parse_json_response(response_text)

CreditAgreementDetails(issuer='HARMONY BIOSCIENCES HOLDINGS, INC.', administrative_agent='JPMORGAN CHASE BANK, N.A.', underwriter='JPMORGAN CHASE BANK, N.A.', agreement_date='2023-07-26')