In [1]:
import re
import json
import pandas as pd
import pdfplumber

In [8]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: The text of each page (extracted with ``layout=True`` and
        ``keep_blank_chars=True``), joined with newline characters.
    """
    # Open the PDF in a context manager so the underlying file handle is
    # closed as soon as the text has been read, even if extraction raises.
    # The extraction happens inside the block because the pages need the
    # open document.
    with pdfplumber.open(pdf_path) as pdf:
        # Extract the text from each page, then separate the pages with the
        # new line character.
        return "\n".join(
            page.extract_text(keep_blank_chars=True, layout=True)
            for page in pdf.pages
        )

In [None]:
def extract_sections(text: str) -> dict[str, str]:
    """Extracts sections from the document text.

    A section header is a match of the header pattern: four or more
    upper-case letters, whitespace, slashes and/or hyphens, anchored at the
    start and end of a line. Each section runs from its header up to the next
    header (or to the end of the text).

    Args:
        text (str): The text of the document.

    Returns:
        dict: A dictionary containing the extracted sections with the section
        headers as keys. Sections that share a header are concatenated in
        document order, and every stored chunk is preceded by a newline (so
        each value starts with "\\n"). Text that appears before the first
        header is not included.
    """
    # Find lines that contain 4 or more upper-case characters and/or slashes
    # and/or hyphens (and are bookended by white space). These will be the
    # section headers.
    # NOTE: `\s` also matches newlines, so blank lines are absorbed into a
    # match and adjacent header-like lines are captured as one header.
    section_header_pattern = re.compile(r"^\s*[A-Z\s\/\-]{4,}\s*$", re.MULTILINE)

    # Collect (starting character index, header text) for every match, and
    # drop any potential headers that are just empty whitespace.
    headers = [
        (match.start(), title)
        for match in section_header_pattern.finditer(text)
        if (title := match.group().strip())
    ]

    # Each section ends where the next header starts; the last section ends
    # at the end of the text.
    section_ends = [start for start, _ in headers[1:]] + [len(text)]

    # Dictionary to store sections.
    sections: dict[str, str] = {}

    for (start_index, header), end_index in zip(headers, section_ends):
        # Extract the section text. Stripping first guarantees the slice
        # begins with the header itself, so it can be cut off by length.
        section_text = text[start_index:end_index].strip()
        section_text = section_text[len(header):].strip()

        # Reduce different versions of the same header to a single version.
        if "ATTORNEY INFORMATION" in header:
            header = "ATTORNEY INFORMATION"
        elif "BAIL INFORMATION" in header:
            header = "BAIL"

        # TODO: pick up here.
        # Append to whatever has already been collected under this header.
        sections[header] = sections.get(header, "") + f"\n{section_text}"

    return sections

In [67]:
def extract_all(pdf_path: str) -> dict[str, str | dict]:
    """Read a PDF and return its text split into sections.

    Args:
        pdf_path (str): Path of the PDF file to process.

    Returns:
        dict: The result of extract_sections() applied to the text that
        extract_text_from_pdf() reads from the PDF.
    """
    return extract_sections(extract_text_from_pdf(pdf_path))

In [None]:
# Run the full extraction on one sample PDF and keep the resulting sections
# dictionary in `a` for inspection.
a = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_CP_02_CR_0000033_2019.pdf")