In [6]:
from unstructured.partition.pdf import partition_pdf
from collections import Counter

# Sample lease PDF that the rest of the notebook works on.
filepath1 = 'data/test/leasedoc_1.pdf'

# Partition the PDF once; later cells reuse `elements`.
# (The original note mentions "ocr_only" as an alternative strategy value.)
elements = partition_pdf(filepath1, strategy="hi_res")

# Tally how many elements of each class the partitioner produced.
element_type_counts = Counter(map(type, elements))
display(element_type_counts)


Counter({unstructured.documents.elements.Title: 49,
         unstructured.documents.elements.NarrativeText: 49,
         unstructured.documents.elements.ListItem: 5,
         unstructured.documents.elements.Table: 3,
         unstructured.documents.elements.Text: 1})

## Basic testing 

In [7]:

# Flatten the partitioned elements into plain text.

# Method 1: stringify every element and put one element per line.
text_content = '\n'.join(map(str, elements))
print(text_content)

LEASE AGREEMENT
This LEASE AGREEMENT (the "Lease" or "Agreement") is made and entered into on this 1 dayof November | 9024, by and between:
Landlord:
Great World Mall, a business entity duly incorporated and having its principal office at [Landlord’s Address] (hereinafter referred to as the "Landlord").
Tenant:
Cold Storage Ltd., a corporation duly incorporated under the laws of Singapore and having its principal office at [Tenant’s Address] (hereinafter referred to as the "Tenant").
Sub-Tenant:
Cheers Ltd., a business unit of Cold Storage Ltd., and a company duly incorporated under the laws of Singapore and having its principal office at [Sub-Tenant’s Address] (hereinafter referred to as the "Sub-Tenant").
RECITALS:
WHEREAS, the Landlord owns and operates a shopping mall known as Great World Mall located at [Mall Address] (hereinafter referred to as the "Premises");
WHEREAS, the Tenant wishes to lease from the Landlord, and the Landlord agrees to lease to the Tenant, a portion of the 

## Testing a cleanup script (elements → Markdown)

In [8]:
# from unstructured.documents import Document
from unstructured.partition.pdf import partition_pdf
from collections import Counter

def clean_text(text):
    """Collapse every run of whitespace (including line breaks) to one space.

    Splitting on whitespace already discards leading and trailing blanks, so
    the re-joined string needs no further trimming.
    """
    words = text.split()
    return ' '.join(words)

def convert_to_markdown(elements):
    """Render partitioned document elements as one Markdown string.

    Each element is dispatched on its class name (``type(element).__name__``)
    after its text has been normalised with ``clean_text``. Elements whose
    text is empty are skipped. Consecutive ``Table`` elements are buffered
    and rendered together through ``format_table``.

    Fixes over the previous version:
    * a buffered table is written *before* the element that follows it, so
      it keeps its position in the document (it used to be appended after
      the following element);
    * a table run at the very end of the document is no longer dropped.

    Args:
        elements: Iterable of element objects; only ``str(element)`` and the
            element's class name are used.

    Returns:
        The Markdown fragments joined with newlines.
    """
    markdown_content = []
    table_data = []  # texts of the Table elements in the run being collected

    def flush_table():
        # Emit the buffered table run, if any, and start a fresh buffer.
        if table_data:
            markdown_content.extend(format_table(table_data))
            table_data.clear()

    for element in elements:
        element_type = type(element).__name__
        element_text = clean_text(str(element))

        if not element_text:  # Skip empty elements
            continue

        if element_type == 'Table':
            table_data.append(element_text)
            continue

        # Any non-table element ends the current table run.
        flush_table()

        if element_type == 'Title':
            markdown_content.append(f"# {element_text}\n")
        elif element_type == 'Header':
            markdown_content.append(f"## {element_text}\n")
        elif element_type == 'ListItem':
            markdown_content.append(f"- {element_text}")
        elif element_type == 'FigureCaption':
            markdown_content.append(f"\n*{element_text}*\n")
        elif element_type == 'Image':
            markdown_content.append(f"![{element_text}](image_path)\n")
        elif element_type == 'NarrativeText':
            markdown_content.append(f"\n{element_text}\n")
        else:  # Default case for Text and other elements
            markdown_content.append(element_text)

    # The document may end with a table; make sure it is written out.
    flush_table()

    return '\n'.join(markdown_content)

def format_table(table_data):
    """Format a list of cell values as a one-row Markdown table.

    Every entry of ``table_data`` becomes one cell of a single header row,
    followed by the ``|---|`` separator row that Markdown tables require.

    Args:
        table_data: Sequence of cell values; each is converted with ``str``.

    Returns:
        A two-item list ``[header_row, separator_row]``; the header row
        starts with a newline so the table is preceded by a blank line once
        the fragments are joined. An empty ``table_data`` yields ``[]``
        instead of a malformed, cell-less table.
    """
    if not table_data:
        return []

    # A literal pipe would be read as a column boundary, so escape it.
    # NOTE: text that already contains "\|" is escaped again.
    cells = [str(cell).replace('|', '\\|') for cell in table_data]

    header_row = '\n| ' + ' | '.join(cells) + ' |'
    separator_row = '|' + '---|' * len(cells)
    return [header_row, separator_row]



In [9]:
# Render the partitioned elements as Markdown.
markdown_content = convert_to_markdown(elements)

# Echo the result in the notebook output.
print(markdown_content)

# Persist the same Markdown as a UTF-8 file.
with open('data/output/leasedoc_1.md', mode='w', encoding='utf-8') as markdown_file:
    markdown_file.write(markdown_content)


# LEASE AGREEMENT


This LEASE AGREEMENT (the "Lease" or "Agreement") is made and entered into on this 1 dayof November | 9024, by and between:

# Landlord:


Great World Mall, a business entity duly incorporated and having its principal office at [Landlord’s Address] (hereinafter referred to as the "Landlord").

# Tenant:


Cold Storage Ltd., a corporation duly incorporated under the laws of Singapore and having its principal office at [Tenant’s Address] (hereinafter referred to as the "Tenant").

# Sub-Tenant:


Cheers Ltd., a business unit of Cold Storage Ltd., and a company duly incorporated under the laws of Singapore and having its principal office at [Sub-Tenant’s Address] (hereinafter referred to as the "Sub-Tenant").

# RECITALS:


WHEREAS, the Landlord owns and operates a shopping mall known as Great World Mall located at [Mall Address] (hereinafter referred to as the "Premises");


WHEREAS, the Tenant wishes to lease from the Landlord, and the Landlord agrees to lease to the

## Trying to create a solution that extracts JSON from Markdown text

In [None]:
import re
import json

def extract_loan_data(text):
    """
    Extract key loan agreement data from ``text`` with regular expressions.

    Returns a dict shaped as::

        {"parties": {"lender": {...}, "borrower": {...}},
         "loanTerms": {"interestPayment": {...}, ...}}

    Keys under ``loanTerms`` other than ``interestPayment`` are only present
    when the matching pattern is found.

    Limitations that are visible in the code:
    * Parties are assigned by hard-coded substrings ("Trust Bank" -> lender,
      "Coffee" -> borrower; "Vader" / "Lightyear" for the contacts), so other
      documents yield empty party entries.
    * ``currency`` is always "SGD" and the ``interestPayment`` values are
      constants; neither is read from the text.
    * NOTE(review): the contact pattern starts at the first
      "words, words" sequence that is eventually followed by "Address" and
      "Email", so an earlier comma in a long document can absorb a contact
      block - confirm against the real input.
    """
    data = {
        "parties": {
            "lender": {},
            "borrower": {}
        },
        "loanTerms": {
            "interestPayment": {}
        }
    }

    # 1. Party Information
    # The name is restricted to one line (after optional list dashes) so it
    # cannot swallow everything since the previous comma.
    company_pattern = (
        r"[-\s]*([^,\n]+),\s*company\s*number\s*([^,]+),\s*"
        r"a\s*company\s*incorporated\s*in\s*([^,]+)\s*"
        r"whose\s*registered\s*office\s*is\s*at\s*([^(]+)"
    )

    for company_match in re.finditer(company_pattern, text):
        company_name = company_match.group(1).strip()
        company_info = {
            "name": company_name,
            "companyNumber": company_match.group(2).strip(),
            "jurisdiction": company_match.group(3).strip(),
            "registeredOffice": company_match.group(4).strip()
        }

        # Determine if lender or borrower
        if "Trust Bank" in company_name:
            data["parties"]["lender"].update(company_info)
        elif "Coffee" in company_name:
            data["parties"]["borrower"].update(company_info)

    # 2. Contact Information
    # Look for contact details in both the main text and schedule
    contact_pattern = r"""
    (?P<name>[\w\s]+?)                           # Name
    \s*,\s*
    (?P<title>[\w \t]+)                          # Title: the words that follow on the same line
    [\s\S]*?                                     # Any text in between
    Address[\s:]+
    (?P<address>(?:(?![Ee]mail)[^\n])+)          # Address: up to end of line or the word Email
    [\s\S]*?                                     # Any text in between
    [Ee]mail[\s:]+(?P<email>[\w.@+-]+)           # Email
    """
    # Why the pattern changed:
    # * title was lazy and followed by a lazy catch-all, so it always
    #   captured exactly one character;
    # * "[^Email\n]+" was a character class, so the address stopped at the
    #   first E, m, a, i or l;
    # * the email class rejected "-" and "+".

    for contact_match in re.finditer(contact_pattern, text, re.VERBOSE):
        contact_info = {
            "name": contact_match.group("name").strip(),
            "title": contact_match.group("title").strip(),
            "address": contact_match.group("address").strip(),
            "email": contact_match.group("email").strip()
        }

        # Assign to appropriate party
        if "Vader" in contact_info["name"]:
            data["parties"]["lender"]["contact"] = contact_info
        elif "Lightyear" in contact_info["name"]:
            data["parties"]["borrower"]["contact"] = contact_info

    # 3. Loan Terms
    # Amount and currency. Accept either comma-grouped digits or a plain run
    # of digits; the old pattern read "$1000000" as 100.
    amount_pattern = r"\$\s*(\d{1,3}(?:,\d{3})+|\d+)"
    if amount_match := re.search(amount_pattern, text):
        amount = int(amount_match.group(1).replace(',', ''))
        data["loanTerms"]["principalAmount"] = amount
        data["loanTerms"]["currency"] = "SGD"

    # Interest rate
    rate_pattern = r"Interest\s+Rate\s+(\d+)%"
    if rate_match := re.search(rate_pattern, text):
        data["loanTerms"]["interestRate"] = int(rate_match.group(1))

    # Drawdown date: everything after the label up to the next full stop
    drawdown_pattern = r"Drawdown\s+Date\s+([^.]+)"
    if drawdown_match := re.search(drawdown_pattern, text, re.IGNORECASE):
        data["loanTerms"]["drawdownDate"] = drawdown_match.group(1).strip()

    # Repayment term: only the number of days is taken from the text
    repayment_pattern = r"repay[^.]+within\s+(\d+)\s+days\s+after[^.]+demand"
    if repayment_match := re.search(repayment_pattern, text, re.IGNORECASE):
        data["loanTerms"]["repaymentTerm"] = f"Within {repayment_match.group(1)} days after written demand from Lender"

    # Interest payment terms (constants, not extracted)
    data["loanTerms"]["interestPayment"].update({
        "frequency": "annually",
        "compounding": True,
        "paymentDate": "date of repayment of the Loan"
    })

    return data

In [None]:
# 1. Load the annotated Markdown. Read it explicitly as UTF-8 (the parser
#    cell further down reads this same file as UTF-8) instead of relying on
#    the platform's default encoding.
with open('data/output/eg_annotated.md', 'r', encoding='utf-8') as file:
    text = file.read()

result = extract_loan_data(text)

print(json.dumps(result, indent=2))

# 2. Save to JSON file
with open('data/output/loan_data.json', 'w', encoding='utf-8') as outfile:
    json.dump(result, outfile, indent=2)

## Trial 1: section-based parser (results were not very good)

In [None]:
import re
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict
from enum import Enum
import pprint as pp

class PartyType(Enum):
    """Role of a party to the loan agreement.

    The string values are used as dictionary keys for the parser's results.
    """
    LENDER = "lender"
    BORROWER = "borrower"

@dataclass
class ContactInfo:
    """Contact details extracted for one party."""
    name: str  # contact person's name
    title: str  # job title; the parser in this cell always stores "" here
    address: str
    email: str

@dataclass
class PartyInfo:
    """Company details extracted for one party to the agreement."""
    name: str
    company_number: str
    jurisdiction: str  # text captured after "a company incorporated in"
    registered_office: str  # text captured after "whose registered office is at"
    # Set by LoanAgreementParser.parse() when a contact with the same role was found.
    contact: Optional[ContactInfo] = None

@dataclass
class InterestPayment:
    """How interest on the loan is paid.

    The parser in this cell fills these fields with fixed values rather than
    reading them from the document.
    """
    frequency: str  # e.g. "annually"
    compounding: bool
    payment_date: str  # free-text description, e.g. "date of repayment of the Loan"

@dataclass
class LoanTerms:
    """Loan terms extracted from the agreement."""
    principal_amount: float  # digits found after "Loan $"; 0 when not found
    currency: str  # the parser in this cell always stores "SGD"
    interest_rate: float  # number in front of the "%" sign; 0 when not found
    drawdown_date: str  # raw text after "Drawdown Date"; "" when not found
    repayment_term: str  # sentence built from the number of days; "" when not found
    interest_payment: InterestPayment

class LoanAgreementParser:
    """Regex-based extractor for a loan agreement stored as Markdown text.

    The text is split into sections on upper-case ``# HEADING`` lines. The
    ``extract_*`` methods search specific sections ("PARTIES", "SCHEDULE",
    "INTERPRETATION", "REPAYMENT AND PREPAYMENT") and ``parse`` combines
    their results into one plain dictionary.
    """

    def __init__(self, text: str):
        """Store ``text`` and pre-compute its section map."""
        self.text = text
        self.sections = self.split_into_sections()

    def split_into_sections(self) -> Dict[str, str]:
        """Split document into major sections using headers.

        A header is a line consisting of ``#``, whitespace, and then only
        upper-case ASCII letters and whitespace. Text before the first header
        is stored under "PREAMBLE". A section without any lines is not
        recorded, and a repeated header name replaces the earlier section.

        Returns:
            Mapping of header text (without ``#``) to the section body.
        """
        sections = {}
        current_section = "PREAMBLE"
        current_content = []

        for line in self.text.split('\n'):
            if re.match(r'^#\s+[A-Z\s]+$', line):  # Major section header
                if current_content:
                    sections[current_section] = '\n'.join(current_content)
                current_section = line.replace('#', '').strip()
                current_content = []
            else:
                current_content.append(line)

        if current_content:
            sections[current_section] = '\n'.join(current_content)

        return sections

    def extract_party_info(self) -> Dict[str, PartyInfo]:
        """Extract party information using structured patterns.

        Searches the "PARTIES" section for entries of the form
        ``<name>, company number <n>, a company incorporated in <place>
        whose registered office is at <address>``.

        Returns:
            Mapping of "lender" / "borrower" to the matching ``PartyInfo``.
            A later entry with the same role replaces an earlier one.
        """
        parties = {}

        # Pattern for company details
        company_pattern = (
            # Company name: kept on one line so it cannot absorb the text of
            # the preceding entry.
            r"[-\s]*([^,\n]+),\s*"
            r"company\s*number\s*([^,]+),\s*"  # Company number
            # Jurisdiction: lazy match up to "whose". The former "[^whose]+"
            # was a character class rejecting the letters w, h, o, s and e,
            # so e.g. "Singapore" could never match.
            r"a\s*company\s*incorporated\s*in\s*([^,]+?)\s*"
            r"whose\s*registered\s*office\s*is\s*at\s*([^(]+)"  # Registered office
        )

        parties_section = self.sections.get("PARTIES", "")
        for match in re.finditer(company_pattern, parties_section):
            party_info = PartyInfo(
                name=match.group(1).strip(),
                company_number=match.group(2).strip(),
                jurisdiction=match.group(3).strip(),
                registered_office=match.group(4).strip()
            )

            party_type = self._party_type_from_label(parties_section[match.end():])
            parties[party_type.value] = party_info

        return parties

    @staticmethod
    def _party_type_from_label(trailing_text: str) -> PartyType:
        """Decide a party's role from the text that follows its entry.

        The parenthesised label directly after the entry (for example
        ``(Lender)``) decides the role. Only when there is no such label, or
        it names neither role, does the original heuristic apply: lender if
        "Lender" occurs anywhere in the remaining text, otherwise borrower.
        Checking the whole remainder first labelled a borrower as lender
        whenever the lender was listed after it.
        """
        label_match = re.match(r"\s*\(([^)]*)\)", trailing_text)
        if label_match:
            label = label_match.group(1).lower()
            if "lender" in label:
                return PartyType.LENDER
            if "borrower" in label:
                return PartyType.BORROWER
        return PartyType.LENDER if "Lender" in trailing_text else PartyType.BORROWER

    def extract_contact_info(self) -> Dict[str, ContactInfo]:
        """Extract contact information from the schedule section.

        Expects "Contact Name", "Company", "Address" and "Email address"
        labels in that order, each followed by its value on the same line.

        Returns:
            Mapping of "lender" / "borrower" to ``ContactInfo``. A contact is
            treated as the lender's when its company contains "Bank".
        """
        contacts = {}
        schedule_section = self.sections.get("SCHEDULE", "")

        # Pattern for contact details in schedule
        contact_pattern = (
            r"Contact\s+Name\s+([^\n]+)\s*"  # Name
            r"Company\s+([^\n]+)\s*"  # Company
            r"Address\s+([^\n]+)\s*"  # Address
            r"Email\s+address\s+([^\n]+)"  # Email
        )

        for match in re.finditer(contact_pattern, schedule_section, re.IGNORECASE):
            contact = ContactInfo(
                name=match.group(1).strip(),
                title="",  # Title is often found in signatures section
                address=match.group(3).strip(),
                email=match.group(4).strip()
            )

            # Associate contact with company
            company_name = match.group(2).strip()
            party_type = (PartyType.LENDER if "Bank" in company_name
                         else PartyType.BORROWER)
            contacts[party_type.value] = contact

        return contacts

    def extract_loan_terms(self) -> LoanTerms:
        """Extract loan terms from various sections of the document.

        Amount, interest rate and drawdown date are searched in the
        "INTERPRETATION" section, the repayment period in "REPAYMENT AND
        PREPAYMENT". Values that are not found fall back to ``0`` or ``""``.
        The currency and the interest payment terms are fixed values.
        """
        # Find definitions section
        definitions_section = self.sections.get("INTERPRETATION", "")

        # Extract basic terms using patterns
        amount_match = re.search(r"Loan\s*\$\s*([0-9,]+)", definitions_section)
        rate_match = re.search(r"Interest\s+Rate\s+(\d+)%", definitions_section)
        drawdown_match = re.search(r"Drawdown\s+Date\s+([^.]+)", definitions_section)

        # Extract repayment terms
        repayment_section = self.sections.get("REPAYMENT AND PREPAYMENT", "")
        repayment_match = re.search(
            r"repay[^.]+within\s+(\d+)\s+days\s+after[^.]+demand",
            repayment_section,
            re.IGNORECASE
        )

        # Interest payment terms are constants; nothing is read from the text.
        interest_payment = InterestPayment(
            frequency="annually",
            compounding=True,
            payment_date="date of repayment of the Loan"
        )

        return LoanTerms(
            principal_amount=float(amount_match.group(1).replace(',', '')) if amount_match else 0,
            currency="SGD",  # Default to SGD, could be made more flexible
            interest_rate=float(rate_match.group(1)) if rate_match else 0,
            drawdown_date=drawdown_match.group(1).strip() if drawdown_match else "",
            repayment_term=f"Within {repayment_match.group(1)} days after written demand from Lender" if repayment_match else "",
            interest_payment=interest_payment
        )

    def parse(self) -> Dict[str, Any]:
        """Parse the loan agreement and return structured data.

        Returns:
            ``{"parties": {role: {...}}, "loanTerms": {...}}`` where the
            dataclass results have been converted to plain dictionaries.
        """
        # Extract party information
        parties = self.extract_party_info()

        # Extract and associate contact information
        contacts = self.extract_contact_info()
        for party_type in PartyType:
            if party_type.value in parties and party_type.value in contacts:
                parties[party_type.value].contact = contacts[party_type.value]

        # Extract loan terms
        loan_terms = self.extract_loan_terms()

        # Construct final output. The keys of `parties` are already the role
        # strings; calling `.value` on them raised AttributeError.
        return {
            "parties": {
                role: asdict(party_info)
                for role, party_info in parties.items()
            },
            "loanTerms": asdict(loan_terms)
        }

def parse_loan_agreement_file(file_path: str) -> Dict[str, Any]:
    """Parse a loan agreement from a markdown file.

    Args:
        file_path: Location of the Markdown file. It is handed straight to
            ``open()``, which accepts plain strings as well as ``Path``
            objects, so no conversion is needed.

    Returns:
        The dictionary produced by ``LoanAgreementParser(text).parse()``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: If the file exists but cannot be read or decoded as UTF-8.
    """
    # Read the markdown file
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Could not find file: {file_path}") from e
    except (OSError, UnicodeDecodeError) as e:
        raise Exception(f"Error reading file {file_path}: {str(e)}") from e

    # The previous body relied on `Path` (never imported) and on an undefined
    # `parse_loan_agreement` function; use the parser class from this
    # notebook instead. Parsing errors propagate unchanged.
    return LoanAgreementParser(text).parse()

In [None]:
# Load the annotated Markdown agreement and run the full parse.
with open('data/output/eg_annotated.md', mode='r', encoding='utf-8') as file:
    text = file.read()

parser = LoanAgreementParser(text)
result = parser.parse()

# Show which section headings the parser recognised.
section_names = parser.sections.keys()
print("Available sections:", section_names)


In [None]:
# Run each extraction step on its own so the intermediate results can be inspected.
parties = parser.extract_party_info()
contacts = parser.extract_contact_info()
loan_terms = parser.extract_loan_terms()

# Pretty-print the three results in the order they were produced.
for extracted in (parties, contacts, loan_terms):
    pp.pprint(extracted)

In [None]:
# Pretty-print the combined dictionary returned earlier by parser.parse().
pp.pprint(result)

## Trial 2

In [None]:
import re
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict
from enum import Enum
import pprint as pp

class PartyType(Enum):
    """Role of a party to the loan agreement.

    The string values are used as dictionary keys for the parser's results.
    """
    LENDER = "lender"
    BORROWER = "borrower"

@dataclass
class ContactInfo:
    """Contact details extracted for one party."""
    name: str  # contact person's name
    title: str  # job title; the parser in this cell always stores "" here
    address: str
    email: str

@dataclass
class PartyInfo:
    """Company details extracted for one party to the agreement."""
    name: str
    company_number: str
    jurisdiction: str  # text captured after "a company incorporated in"
    registered_office: str  # text captured after "whose registered office is at"
    # Set by LoanAgreementParser.parse() when a contact with the same role was found.
    contact: Optional[ContactInfo] = None

@dataclass
class InterestPayment:
    """How interest on the loan is paid.

    The parser in this cell fills these fields with fixed values rather than
    reading them from the document.
    """
    frequency: str  # e.g. "annually"
    compounding: bool
    payment_date: str  # free-text description, e.g. "date of repayment of the Loan"

@dataclass
class LoanTerms:
    """Loan terms extracted from the agreement."""
    principal_amount: float  # digits found after "Loan $"; 0 when not found
    currency: str  # the parser in this cell always stores "SGD"
    interest_rate: float  # number in front of the "%" sign; 0 when not found
    drawdown_date: str  # raw text after "Drawdown Date"; "" when not found
    repayment_term: str  # sentence built from the number of days; "" when not found
    interest_payment: InterestPayment

class LoanAgreementParser:
    """Regex-based extractor for a loan agreement stored as Markdown text.

    The text is split into sections on upper-case ``# HEADING`` lines. The
    ``extract_*`` methods search specific sections ("PARTIES", "SCHEDULE",
    "INTERPRETATION", "REPAYMENT AND PREPAYMENT") and ``parse`` combines
    their results into one plain dictionary.
    """

    def __init__(self, text: str):
        """Store ``text`` and pre-compute its section map."""
        self.text = text
        self.sections = self.split_into_sections()

    def split_into_sections(self) -> Dict[str, str]:
        """Split document into major sections using headers.

        A header is a line consisting of ``#``, whitespace, and then only
        upper-case ASCII letters and whitespace. Text before the first header
        is stored under "PREAMBLE". A section without any lines is not
        recorded, and a repeated header name replaces the earlier section.

        Returns:
            Mapping of header text (without ``#``) to the section body.
        """
        sections = {}
        current_section = "PREAMBLE"
        current_content = []

        for line in self.text.split('\n'):
            if re.match(r'^#\s+[A-Z\s]+$', line):  # Major section header
                if current_content:
                    sections[current_section] = '\n'.join(current_content)
                current_section = line.replace('#', '').strip()
                current_content = []
            else:
                current_content.append(line)

        if current_content:
            sections[current_section] = '\n'.join(current_content)

        return sections

    def extract_party_info(self) -> Dict[str, PartyInfo]:
        """Extract party information using structured patterns.

        Searches the "PARTIES" section for entries of the form
        ``<name>, company number <n>, a company incorporated in <place>
        whose registered office is at <address>``.

        Returns:
            Mapping of "lender" / "borrower" to the matching ``PartyInfo``.
            A later entry with the same role replaces an earlier one.
        """
        parties = {}

        # Pattern for company details
        company_pattern = (
            # Company name: kept on one line so it cannot absorb the text of
            # the preceding entry.
            r"[-\s]*([^,\n]+),\s*"
            r"company\s*number\s*([^,]+),\s*"  # Company number
            # Jurisdiction: lazy match up to "whose". The former "[^whose]+"
            # was a character class rejecting the letters w, h, o, s and e,
            # so e.g. "Singapore" could never match.
            r"a\s*company\s*incorporated\s*in\s*([^,]+?)\s*"
            r"whose\s*registered\s*office\s*is\s*at\s*([^(]+)"  # Registered office
        )

        parties_section = self.sections.get("PARTIES", "")
        for match in re.finditer(company_pattern, parties_section):
            party_info = PartyInfo(
                name=match.group(1).strip(),
                company_number=match.group(2).strip(),
                jurisdiction=match.group(3).strip(),
                registered_office=match.group(4).strip()
            )

            party_type = self._party_type_from_label(parties_section[match.end():])
            parties[party_type.value] = party_info

        return parties

    @staticmethod
    def _party_type_from_label(trailing_text: str) -> PartyType:
        """Decide a party's role from the text that follows its entry.

        The parenthesised label directly after the entry (for example
        ``(Lender)``) decides the role. Only when there is no such label, or
        it names neither role, does the original heuristic apply: lender if
        "Lender" occurs anywhere in the remaining text, otherwise borrower.
        Checking the whole remainder first labelled a borrower as lender
        whenever the lender was listed after it.
        """
        label_match = re.match(r"\s*\(([^)]*)\)", trailing_text)
        if label_match:
            label = label_match.group(1).lower()
            if "lender" in label:
                return PartyType.LENDER
            if "borrower" in label:
                return PartyType.BORROWER
        return PartyType.LENDER if "Lender" in trailing_text else PartyType.BORROWER

    def extract_contact_info(self) -> Dict[str, ContactInfo]:
        """Extract contact information from the schedule section.

        Looks for one block per role, introduced by "BORROWER" or "LENDER"
        and followed by the labels "Contact Name", "Company", "Address" and
        "Email address" in that order. The email ends at the first
        whitespace or ``|``.

        Returns:
            Mapping of "borrower" / "lender" to ``ContactInfo`` for the
            blocks that were found. Prints a debug line when none match.
        """
        contacts = {}
        schedule_section = self.sections.get("SCHEDULE", "")

        # Each value is matched lazily up to the next field label. The former
        # "[^Company]+", "[^Address]+" and "[^Email]+" were character classes,
        # so a value containing any of those letters could not match.
        fields_pattern = (
            r"\s+Contact\s+Name\s+(.+?)\s*"  # Name
            r"Company\s+(.+?)\s*"  # Company
            r"Address\s+(.+?)\s*"  # Address
            r"Email\s+address\s+([^\s|]+)"  # Email
        )

        # BORROWER first, then LENDER, as before. DOTALL lets a value span
        # line breaks, which the old character classes also permitted.
        for party_type in (PartyType.BORROWER, PartyType.LENDER):
            block_match = re.search(
                party_type.name + fields_pattern, schedule_section, re.DOTALL
            )
            if block_match:
                contacts[party_type.value] = ContactInfo(
                    name=block_match.group(1).strip(),
                    title="",
                    address=block_match.group(3).strip(),
                    email=block_match.group(4).strip()
                )

        # Debug information
        if not contacts:
            print("DEBUG: No matches found for either lender or borrower")

        return contacts

    def extract_loan_terms(self) -> LoanTerms:
        """Extract loan terms from various sections of the document.

        Amount, interest rate and drawdown date are searched in the
        "INTERPRETATION" section, the repayment period in "REPAYMENT AND
        PREPAYMENT". Values that are not found fall back to ``0`` or ``""``.
        The currency and the interest payment terms are fixed values.
        """
        # Find definitions section
        definitions_section = self.sections.get("INTERPRETATION", "")

        # Extract basic terms using patterns
        amount_match = re.search(r"Loan\s*\$\s*([0-9,]+)", definitions_section)
        rate_match = re.search(r"Interest\s+Rate\s+(\d+)%", definitions_section)
        drawdown_match = re.search(r"Drawdown\s+Date\s+([^.]+)", definitions_section)

        # Extract repayment terms
        repayment_section = self.sections.get("REPAYMENT AND PREPAYMENT", "")
        repayment_match = re.search(
            r"repay[^.]+within\s+(\d+)\s+days\s+after[^.]+demand",
            repayment_section,
            re.IGNORECASE
        )

        # Interest payment terms are constants; nothing is read from the text.
        interest_payment = InterestPayment(
            frequency="annually",
            compounding=True,
            payment_date="date of repayment of the Loan"
        )

        return LoanTerms(
            principal_amount=float(amount_match.group(1).replace(',', '')) if amount_match else 0,
            currency="SGD",  # Default to SGD, could be made more flexible
            interest_rate=float(rate_match.group(1)) if rate_match else 0,
            drawdown_date=drawdown_match.group(1).strip() if drawdown_match else "",
            repayment_term=f"Within {repayment_match.group(1)} days after written demand from Lender" if repayment_match else "",
            interest_payment=interest_payment
        )

    def parse(self) -> Dict[str, Any]:
        """Parse the loan agreement and return structured data.

        Returns:
            ``{"parties": {role: {...}}, "loanTerms": {...}}`` where the
            dataclass results have been converted to plain dictionaries.
        """
        # Extract party information
        parties = self.extract_party_info()

        # Extract and associate contact information
        contacts = self.extract_contact_info()
        for party_type in PartyType:
            if party_type.value in parties and party_type.value in contacts:
                parties[party_type.value].contact = contacts[party_type.value]

        # Extract loan terms
        loan_terms = self.extract_loan_terms()

        # Construct final output. The keys of `parties` are already the role
        # strings; calling `.value` on them raised AttributeError.
        return {
            "parties": {
                role: asdict(party_info)
                for role, party_info in parties.items()
            },
            "loanTerms": asdict(loan_terms)
        }

def parse_loan_agreement_file(file_path: str) -> Dict[str, Any]:
    """Parse a loan agreement from a markdown file.

    Args:
        file_path: Location of the Markdown file. It is handed straight to
            ``open()``, which accepts plain strings as well as ``Path``
            objects, so no conversion is needed.

    Returns:
        The dictionary produced by ``LoanAgreementParser(text).parse()``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: If the file exists but cannot be read or decoded as UTF-8.
    """
    # Read the markdown file
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Could not find file: {file_path}") from e
    except (OSError, UnicodeDecodeError) as e:
        raise Exception(f"Error reading file {file_path}: {str(e)}") from e

    # The previous body relied on `Path` (never imported) and on an undefined
    # `parse_loan_agreement` function; use the parser class from this
    # notebook instead. Parsing errors propagate unchanged.
    return LoanAgreementParser(text).parse()

In [None]:
# Re-create the parser from `text` (loaded in an earlier cell) using the
# class definition of this trial, then list the extracted contacts.
parser = LoanAgreementParser(text)
contacts = parser.extract_contact_info()

print("\nExtracted Contacts:")
for party_type in contacts:
    contact = contacts[party_type]
    print(f"\n{party_type.upper()}:")
    print(f"Name: {contact.name}")
    print(f"Address: {contact.address}")
    print(f"Email: {contact.email}")