In [1]:
import pdfplumber
import re
import json
import os
import argparse

In [5]:
def process_files(pdf_path, output_dir, md_filename, json_filename):
    """
    Convert a PDF into a Markdown file and a JSON file derived from it.

    The PDF is read with ``extract_text_from_pdf_with_metadata``, rendered to
    Markdown by ``convert_to_markdown`` and written to disk. That Markdown
    file is then parsed by ``parse_markdown_to_json`` and the result is saved
    as indented JSON. The paths of both output files are printed.

    Args:
        pdf_path (str): Path to the input PDF file
        output_dir (str): Directory where output files will be saved. It is
            created (including parents) when missing; an empty string means
            the current working directory.
        md_filename (str): Name for the output Markdown file; ".md" is
            appended when the name does not already end with it
        json_filename (str): Name for the output JSON file; ".json" is
            appended when the name does not already end with it

    Raises:
        ValueError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF file not found: {pdf_path}")

    # exist_ok avoids the check-then-create race; the guard skips
    # os.makedirs(""), which would raise for an empty directory name.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Add .md extension if not provided
    if not md_filename.endswith('.md'):
        md_filename += '.md'

    # Add .json extension if not provided
    if not json_filename.endswith('.json'):
        json_filename += '.json'

    # Define output paths with custom filenames
    output_md_path = os.path.join(output_dir, md_filename)
    json_output_path = os.path.join(output_dir, json_filename)

    # Extract text and metadata from the PDF
    pdf_lines_with_metadata = extract_text_from_pdf_with_metadata(pdf_path)

    # Convert the extracted text to Markdown
    markdown_content = convert_to_markdown(pdf_lines_with_metadata)

    # Save the Markdown content
    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    # The JSON parser reads the Markdown back from disk, so it must run after
    # the file above has been written and closed.
    parsed_document = parse_markdown_to_json(output_md_path)

    # Save JSON to file
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(parsed_document, json_file, indent=4)

    print(f"Markdown file created at: {output_md_path}")
    print(f"JSON file created at: {json_output_path}")

# Define a function to extract text and metadata from a PDF
def extract_text_from_pdf_with_metadata(pdf_path, vertical_tolerance=5):
    """
    Extract the text lines of a PDF together with their bounding boxes.

    On every page the characters are visited in descending ``y0`` order (ties
    in descending ``x0``) and grouped into lines: a character joins the
    current line while its ``y0`` is within ``vertical_tolerance`` of the
    ``y0`` of the character that opened the line; otherwise it opens a new
    line. The text of each line is built by ``group_chars_into_lines``.

    Args:
        pdf_path (str): Path of the PDF file opened with ``pdfplumber``.
        vertical_tolerance (float): Maximum ``y0`` difference, relative to
            the first character of a line, for a character to be kept on
            that line. Defaults to 5.

    Returns:
        list[dict]: One dictionary per line, pages in document order, with
        the keys ``text`` (stripped line text), ``x0``/``x1``/``y0``/``y1``
        (extremes over the line's characters) and ``page_width``.
    """
    extracted_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            lines = []
            current_line = []
            current_y_pos = None

            # NOTE(review): pdfplumber documents y0 as measured from the
            # bottom of the page, so descending y0 should read top-to-bottom
            # -- confirm against the library version in use.
            ordered_chars = sorted(
                page.chars, key=lambda c: (c["y0"], c["x0"]), reverse=True
            )

            for char in ordered_chars:
                y_pos = char["y0"]

                # The tolerance is measured against the line's first
                # character, not the previous one, so a line cannot drift.
                if (
                    current_y_pos is None
                    or abs(y_pos - current_y_pos) > vertical_tolerance
                ):
                    if current_line:
                        lines.append(current_line)
                    current_line = [char]
                    current_y_pos = y_pos
                else:
                    current_line.append(char)

            # Flush the line that was still open when the page ended
            if current_line:
                lines.append(current_line)

            for chars in lines:
                line_text = group_chars_into_lines(chars)
                extracted_text.append(
                    {
                        "text": line_text.strip(),
                        "x0": min(char["x0"] for char in chars),
                        "x1": max(char["x1"] for char in chars),
                        "y0": min(char["y0"] for char in chars),
                        "y1": max(char["y1"] for char in chars),
                        "page_width": page.width,
                    }
                )

    return extracted_text


def group_chars_into_lines(chars, space_threshold=2):
    """
    Build the text of one line from its characters.

    The characters are ordered by ``x0``. A single space is inserted before
    a character whose ``x0`` lies more than ``space_threshold`` beyond the
    ``x1`` of the previous character.

    Args:
        chars (iterable[dict]): Character records with the keys ``text``,
            ``x0`` and ``x1``.
        space_threshold (float): Horizontal gap above which a space is
            inserted. Defaults to 2.

    Returns:
        str: The reconstructed line text ("" for no characters).
    """
    ordered = sorted(chars, key=lambda c: c["x0"])
    pieces = []
    last_x1 = None

    for char in ordered:
        if last_x1 is not None and char["x0"] - last_x1 > space_threshold:
            pieces.append(" ")
        pieces.append(char["text"])
        last_x1 = char["x1"]

    # Joining once avoids rebuilding the string for every character
    return "".join(pieces)





# Define a function to format text into Markdown based on given rules
def convert_to_markdown(lines_with_metadata):
    """
    Render extracted lines as Markdown, one paragraph per non-empty line.

    Heading level is picked by the first rule that matches:
      * ``#``    -- the line is centered (per ``is_centered_text``) and
                    upper-case;
      * ``##``   -- the first word is upper-case;
      * ``###``  -- the first word is all digits and the second word is
                    upper-case;
      * ``####`` -- the line is longer than two characters, its second
                    character is "." and its first three characters are
                    digits once dots are removed (e.g. "1.2");
      * otherwise the line is emitted as plain text.

    Args:
        lines_with_metadata (iterable[dict]): Entries with a ``text`` key
            plus whatever ``is_centered_text`` reads.

    Returns:
        str: The Markdown text; every emitted line is followed by a blank
        line.
    """
    rendered = []

    for entry in lines_with_metadata:
        text = entry["text"]
        if not text:
            continue

        words = text.split()

        if is_centered_text(entry) and text.isupper():
            prefix = "# "
        elif words[0].isupper():
            prefix = "## "
        elif len(words) > 1 and words[0].isdigit() and words[1].isupper():
            prefix = "### "
        elif (
            len(text) > 2
            and text[:3].replace(".", "").isdigit()
            and text[1] == "."
        ):
            prefix = "#### "
        else:
            prefix = ""

        rendered.append(f"{prefix}{text}\n\n")

    return "".join(rendered)

# Check if text is centered on the page
def is_centered_text(line_metadata):
    """
    Tell whether a line sits horizontally in the middle of its page.

    Args:
        line_metadata (dict): Must provide ``x0``, ``x1`` and ``page_width``.

    Returns:
        bool: True when the midpoint between ``x0`` and ``x1`` is at most
        40 units away from half of ``page_width``.
    """
    margin = 40  # Allowable distance between line midpoint and page midpoint
    line_midpoint = (line_metadata["x0"] + line_metadata["x1"]) / 2
    offset = line_midpoint - line_metadata["page_width"] / 2
    return -margin <= offset <= margin


    


def parse_markdown_to_json(md_file_path):
    """
    Parse the Markdown rendering of a loan agreement into a dictionary.

    The file is scanned line by line with a small state machine keyed on
    literal headings ("## PARTIES", "### 3  INTEREST",
    "### 4  REPAYMENT AND PREPAYMENT", "#### 4.2  Prepayment:",
    "#### 5.2  Default", "#### 1.1  Definitions", "#### 1.2  Interpretation").
    The text collected for each section is then mined with regular
    expressions. Sections that are absent simply yield empty/None values.

    Args:
        md_file_path (str): Path of the UTF-8 Markdown file to read.

    Returns:
        dict: A document with the keys ``documentType``, ``parties``
        (``lender`` and ``borrower``, each with name, companyNumber,
        jurisdiction, registeredOffice and contact), ``loanTerms``,
        ``eventsOfDefault`` and ``governingLaw`` (always "Singapore").
    """
    with open(md_file_path, "r", encoding="utf-8") as md_file:
        lines = md_file.readlines()

    # initialize the JSON structure
    document = {
        "documentType": "",
        "parties": {"lender": {}, "borrower": {}},
        "loanTerms": {},
        "eventsOfDefault": [],
        "governingLaw": "Singapore",
    }

    # Scanner state
    current_section = None
    document_type = False
    lender_text = ""
    borrower_text = ""
    found_lender = False
    found_borrower = False
    interest_match = ""
    interest_match_found = False
    interpretation = False
    drawdown_text = ""
    drawdown_start = False
    repayment_text = ""
    repayment_start = False
    # Bound up front so that a document without these sections produces
    # empty text instead of an UnboundLocalError after the loop.
    interest_text = ""
    default_text = ""

    # The order of the branches below is significant: the first match wins.
    for i, line in enumerate(lines):
        line = line.strip()

        # The first "# " heading is the document type
        if line.startswith("# ") and not document_type:
            document["documentType"] = line[2:]
            document_type = True

        # detect section ## PARTIES
        elif line.startswith("## PARTIES"):
            current_section = "parties"
        elif current_section == "parties" and line.startswith("##"):
            current_section = None  # end of the parties section

        # process the text for lender and borrower
        elif current_section == "parties":
            # NOTE(review): the line after "1 ..." is always treated as its
            # continuation, even if it is the "2 ..." borrower line.
            if line.startswith("1 "):  # Text for lender
                lender_text += line[2:] + " "
                found_lender = True
            elif found_lender and line:
                lender_text += line
                found_lender = False
            elif line.startswith("2 "):  # Text for borrower
                borrower_text += line[2:] + " "
                found_borrower = True
            elif found_borrower and line:
                borrower_text += line
                found_borrower = False

        elif "### 3  INTEREST" in line:
            interest_text = ""
            current_section = "interest"

        elif (
            "### 4  REPAYMENT AND PREPAYMENT" in line and current_section == "interest"
        ):
            # End of the interest section, start of the repayment text
            current_section = None
            repayment_start = True
        elif repayment_start and "#### 4.2  Prepayment:" in line:
            repayment_start = False
        elif repayment_start:
            repayment_text += line
        elif current_section == "interest":
            interest_text += line

        elif "#### 5.2  Default" in line:
            default_text = ""
            current_section = "default"
        elif (
            "any security over the assets of the Borrower is enforced" in line
            and current_section == "default"
        ):
            current_section = None
        elif current_section == "default":
            default_text += line + " "

        elif "Loan $ _____________" in line:
            # NOTE(review): this value is unconditionally replaced after the
            # loop by the amount parsed from the "Interest Rate" definition.
            document["loanTerms"]["principalAmount"] = lines[i - 1].strip()

        elif "#### 1.1  Definitions" in line:
            current_section = "definitions"
        elif "#### 1.2  Interpretation" in line and current_section == "definitions":
            # End of the definitions section
            interpretation = True
            current_section = None

        elif current_section == "definitions" and "Drawdown Date" in line:
            drawdown_text += line
            drawdown_start = True
        elif (
            current_section == "definitions"
            and drawdown_start
            and "Event of Default" in line
        ):
            drawdown_start = False
        elif current_section == "definitions" and drawdown_start:
            drawdown_text += line

        elif interpretation and "##" in line:
            # First heading after "1.2 Interpretation": its first token is
            # stored as the currency.
            currency_match = re.search(r"##\s*(\S+)", line)
            document["loanTerms"]["currency"] = (
                currency_match.group(1).strip() if currency_match else None
            )
            interpretation = False

        elif current_section == "definitions":
            # Collect the "Interest Rate" definition and everything after it
            if "Interest Rate" in line:
                interest_match += line
                interest_match_found = True
            elif interest_match_found:
                interest_match += line

    cleaned_line = interest_match.replace("_", "").strip()
    # add a space between numbers and text that were glued together
    cleaned_line = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", cleaned_line)

    # The principal amount is the number that follows "per annum."; it is
    # kept as a string (no conversion to float).
    principal_match = re.search(r"per annum\.\s*([\d,]+)", cleaned_line)
    document["loanTerms"]["principalAmount"] = (
        principal_match.group(1) if principal_match else None
    )

    # The interest rate is the first number after "Interest Rate"
    interest_rate = None
    interest_rate_match = re.search(r"Interest Rate\s+([\d\.]+)", cleaned_line)
    if interest_rate_match:
        try:
            interest_rate = float(interest_rate_match.group(1))
        except ValueError:
            # The pattern also accepts strings such as "." or "1.2.3"
            interest_rate = None
    document["loanTerms"]["interestRate"] = interest_rate

    interest_payment = document["loanTerms"].setdefault("interestPayment", {})

    frequency = None
    for candidate in ("annually", "monthly", "daily"):
        if candidate in interest_text:
            frequency = candidate
            break
    interest_payment["frequency"] = frequency

    interest_payment["compounding"] = "compounding" in interest_text

    payment_date_match = re.search(r"payable on\s*(.*)", interest_text, re.IGNORECASE)
    interest_payment["paymentDate"] = (
        payment_date_match.group(1).strip() if payment_date_match else None
    )

    # Events of default: the text before "if:" followed by the points that
    # are separated by the "ii" / "iii" markers.
    propositions = []
    if_match = re.split(r"\bif\s*:\s*", default_text, flags=re.IGNORECASE)
    if len(if_match) > 1:
        propositions.append(if_match[0].strip())  # Text before "if"
        numbered_points = re.split(r"\bii\b|\biii\b", if_match[1], flags=re.IGNORECASE)
        propositions.extend([p.strip() for p in numbered_points if p.strip()])

    for proposition in propositions:
        if proposition:
            proposition = clean_text(proposition)
        document["eventsOfDefault"].append(proposition)

    # Drawdown date: the collected definition without its leading label
    unwanted_text = "Drawdown Date  the date that is"
    document["loanTerms"]["drawdownDate"] = drawdown_text.replace(
        unwanted_text, ""
    ).strip()

    # Repayment term: the collected text without the 4.1 heading
    unwanted_text = "#### 4.1  Repayment of Loan:"
    document["loanTerms"]["repaymentTerm"] = repayment_text.replace(
        unwanted_text, ""
    ).strip()

    # Populate lender and borrower
    _populate_party(lender_text, document["parties"]["lender"])
    _populate_party(borrower_text, document["parties"]["borrower"])

    # Contact blocks and signatory titles are derived from the whole file,
    # so they are extracted once rather than once per party.
    document["parties"]["borrower"]["contact"] = extract_contact_details(
        extract_borrower_to_lender_text(lines)
    )
    document["parties"]["lender"]["contact"] = extract_contact_details(
        extract_lender_text(lines)
    )
    _assign_signatory_titles(lines, document["parties"])

    return document


def _populate_party(party_text, party_dict):
    """Fill name, companyNumber, jurisdiction and registeredOffice of one party."""
    party_text = party_text.replace("_", "")

    # The company name is everything before the first comma
    company_name_match = re.search(r"^(.*?)(?=,)", party_text)
    party_dict["name"] = (
        company_name_match.group(1).strip() if company_name_match else "Unknown"
    )

    # Company number, stripped of any character that is not a word character
    company_number_match = re.search(
        r"company number (\S+)", party_text, re.IGNORECASE
    )
    party_dict["companyNumber"] = (
        re.sub(r"[^\w]", "", company_number_match.group(1).strip())
        if company_number_match
        else "Unknown"
    )

    # Jurisdiction: the single token after "incorporated in"
    jurisdiction_match = re.search(
        r"incorporated in (\S+)", party_text, re.IGNORECASE
    )
    party_dict["jurisdiction"] = (
        jurisdiction_match.group(1).strip() if jurisdiction_match else "Unknown"
    )

    # Registered office: the single token after "registered office is at"
    registered_office_match = re.search(
        r"registered office is at (\S+)", party_text, re.IGNORECASE
    )
    party_dict["registeredOffice"] = (
        registered_office_match.group(1).strip()
        if registered_office_match
        else "Unknown"
    )


def _assign_signatory_titles(lines, parties):
    """Set contact["title"] of lender (1st signature) and borrower (2nd signature)."""
    signature_count = 0
    lender_signature = False
    borrow_signature = False

    for line in lines:
        line = line.strip()
        is_marker = "Signature of authorised signatory" in line

        if not (is_marker or lender_signature or borrow_signature):
            continue

        if is_marker:
            signature_count += 1

        if signature_count == 1 and not lender_signature:
            # First signature block: start looking for the lender's title
            lender_signature = True
        elif signature_count == 1 and lender_signature and line:
            if "," in line:
                # The title is the text after the first comma
                parties["lender"]["contact"]["title"] = line.split(",", 1)[1].strip()
                lender_signature = False
        elif signature_count == 2 and not borrow_signature:
            # Second signature block: start looking for the borrower's title
            borrow_signature = True
        elif signature_count == 2 and borrow_signature and line:
            # Same comma guard as for the lender; a line without a comma
            # would otherwise raise IndexError.
            if "," in line:
                parties["borrower"]["contact"]["title"] = line.split(",", 1)[1].strip()
                borrow_signature = False

def extract_borrower_to_lender_text(lines):
    """
    Collect the lines between a "## BORROWER" heading and "## LENDER".

    Every line is stripped first. Scanning stops at the first line that
    starts with "## LENDER", whether or not a "## BORROWER" heading has been
    seen. Heading lines themselves are not returned.

    Args:
        lines (iterable[str]): Lines of the Markdown document.

    Returns:
        list[str]: The stripped lines of the section with every underscore
        removed.
    """
    collected = []
    inside_section = False

    for raw_line in lines:
        text = raw_line.strip()

        if text.startswith("## LENDER"):
            break

        if text.startswith("## BORROWER"):
            inside_section = True
        elif inside_section:
            collected.append(text.replace("_", ""))

    return collected


def extract_lender_text(lines):
    """
    Collect every line that follows the first "## LENDER" heading.

    Every line is stripped first. Lines that themselves start with
    "## LENDER" are never returned.

    Args:
        lines (iterable[str]): Lines of the Markdown document.

    Returns:
        list[str]: The stripped lines after the heading with every
        underscore removed; empty when no such heading exists.
    """
    stripped = [entry.strip() for entry in lines]

    for position, text in enumerate(stripped):
        if text.startswith("## LENDER"):
            return [
                following.replace("_", "")
                for following in stripped[position + 1:]
                if not following.startswith("## LENDER")
            ]

    return []


# Extract the contact name, address, and email address from cleaned lines
def extract_contact_details(borrower_to_lender_lines):
    """
    Pull contact name, address and email out of a list of lines.

    Each line is classified by the first of these that matches
    (case-insensitively): it contains "Contact Name", it starts with the
    word "Address", or it contains "Email address". The value is whatever
    follows the label and one space. When several lines carry the same
    label, the last one with a value wins.

    Args:
        borrower_to_lender_lines (iterable[str]): Lines to inspect.

    Returns:
        dict: ``{"name": ..., "address": ..., "email": ...}``; a field that
        was not found is "Unknown".
    """
    # (result key, pattern that selects the line, pattern that captures the value)
    field_rules = (
        ("name", r"Contact Name", r"Contact Name (.*)"),
        ("address", r"^Address\b", r"^Address\b (.*)"),
        ("email", r"Email address", r"Email address (.*)"),
    )
    details = {"name": "Unknown", "address": "Unknown", "email": "Unknown"}

    for line in borrower_to_lender_lines:
        for key, selector, extractor in field_rules:
            if not re.search(selector, line, re.IGNORECASE):
                continue
            captured = re.search(extractor, line, re.IGNORECASE)
            if captured:
                details[key] = captured.group(1).strip()
            # Only the first matching rule is applied to a line
            break

    return details


def clean_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

In [6]:
# Run the pipeline on the first sample PDF; the Markdown and JSON outputs are written to data/output.
process_files('data/test/eg_annotated.pdf', 'data/output', 'eg_annotated1_plumber.md', 'output_eg1_plumber.json')

Markdown file created at: data/output/eg_annotated1_plumber.md
JSON file created at: data/output/output_eg1_plumber.json


In [7]:
# Run the pipeline on the second sample PDF, using separate output file names in data/output.
process_files('data/test/eg_annotated_cancel.pdf', 'data/output', 'eg_annotated2_plumber.md', 'output_eg2_plumber.json')

Markdown file created at: data/output/eg_annotated2_plumber.md
JSON file created at: data/output/output_eg2_plumber.json
