In [None]:
import re
import json
import pandas as pd
import pdfplumber

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path (str): File path to the PDF.

    Returns:
        str: The text of all pages, in page order, with consecutive pages
            separated by a newline character. Text is extracted with
            ``keep_blank_chars=True`` and ``layout=True``.
    """
    # Open the PDF in a context manager so the underlying file is closed again,
    # even when extracting one of the pages raises.
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` keeps str.join() from raising TypeError should extract_text()
        # hand back None for a page without text (NOTE(review): version
        # dependent in pdfplumber -- confirm whether this can still happen).
        page_texts = [
            page.extract_text(keep_blank_chars=True, layout=True) or ""
            for page in pdf.pages
        ]
    return "\n".join(page_texts)

In [None]:

def extract_sections(text: str) -> dict[str, str]:
    """Split the document text into sections keyed by their header line.

    A header is a single line that, once surrounding whitespace is ignored, is
    at least 4 characters long and consists only of upper-case ASCII letters,
    whitespace, slashes and hyphens. Everything between one header line and the
    next (or the end of the text) is that header's section text. Text in front
    of the first header is not part of any section.

    Args:
        text (str): The text of the document.

    Returns:
        dict: Section headers mapped to their section text. Each piece of
            section text is stripped and appended with a leading newline, so a
            header that occurs several times accumulates all of its pieces
            under one key.
    """
    # `[^\S\n]` is "any whitespace except a newline". Using it instead of `\s`
    # keeps a match on one line; with `\s` two consecutive all-caps lines were
    # merged into a single multi-line header, hiding the second header's key.
    header_line_pattern = re.compile(
        r"^[^\S\n]*(?P<header>[A-Z/\-](?:[A-Z/\-]|[^\S\n]){2,}[A-Z/\-])[^\S\n]*$",
        re.MULTILINE,
    )
    header_matches = list(header_line_pattern.finditer(text))

    sections: dict[str, str] = {}
    for index, match in enumerate(header_matches):
        header = match.group("header")

        # The section body runs from the end of the header line up to the start
        # of the next header line (or the end of the text).
        if index + 1 < len(header_matches):
            body_end = header_matches[index + 1].start()
        else:
            body_end = len(text)
        section_text = text[match.end():body_end].strip()

        # Reduce different versions of the same header to a single version,
        # e.g. a line that carries two headers side by side.
        if "ATTORNEY INFORMATION" in header:
            header = "ATTORNEY INFORMATION"
        elif "BAIL INFORMATION" in header:
            header = "BAIL"

        # Append rather than assign so repeated headers keep all of their text.
        sections[header] = sections.get(header, "") + f"\n{section_text}"

    return sections

In [None]:
# Extracts the defendant's information from the DEFENDANT INFORMATION section.
# Args:
#   text (str): The text containing the defendant's information.
# Return:
#   dict: A dictionary containing the extracted information.

def extract_defendant_information(text: str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    i = 0

    # Defendant information follows a straightforward pattern.
    # In CP dockets, there is only one line which contains defendant DOB and address.
    while(i < len(split)):
        line = split[i].lower().strip()
        if("date of birth:" in line and "city/state/zip:" in line):
            extracted_info["dob"] = line.split("date of birth:")[1].split("city/state/zip:")[0].strip()
            extracted_info["address"] = line.split("date of birth:")[1].split("city/state/zip:")[1].strip()
            i += 1
        # Line is a junk line. Keep moving on.
        else:
            i += 1
        
    return extracted_info

In [None]:
def extract_case_information(text: str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    i = 0

    # Case information follows a straightforward pattern.
    # In CP dockets:
    #   Line 1 (Optional and potentially multiple lines) is cross court docket numbers.
    #   Line 2 is judge assigned, date filed, and initiation date.
    #   Line 3 is OTN, LOTN, and originating docket number.
    #   Line 4 is initial and final issuing authority.
    #   Line 5 is arresting agency (potentially multiple lines) and arresting officer.
    #   Line 6 is complaint/citation number and incident number.
    #   Line 7 is county and township.
    #   Line 8 + 9 is case local number type and case local number.
    while(i < len(split)):
        line = split[i].lower().strip()
        
        if("cross court docket nos:" in line):
            extracted_info["cross_court_docker_nrs"] = line.split(",")
            j = 1

            # Check the next line for more docket numbers.
            while("judge assigned" not in split[i + j] and "date filed" not in split[i + j] and "initiation date" not in split[i + j]):
                lookahead_line = split[i + j].lower.strip()
                extracted_info["cross_court_docker_nrs"] = extracted_info["cross_court_docker_nrs"].extend(lookahead_line.split(","))
                j += 1
            
            i += 1 + (j - 1)
        elif("judge assigned:" in line or "date filed:" in line or "initiation date:" in line):
            extracted_info["judge"] = line.split("judge assigned:")[1].split("date filed:")[0].strip()
            extracted_info["date_filed"] = line.split("judge assigned:")[1].split("date filed:")[1].split("initiation date:")[0].strip()
            extracted_info["initiation_date"] = line.split("judge assigned:")[1].split("date filed:")[1].split("initiation date:")[1].strip()
            i += 1
        elif("otn:" in line or "lotn:" in line or "originating docket no:" in line):
            extracted_info["otn"] = line.split("otn:")[1].split("lotn:")[0].strip()
            extracted_info["otn"] = line.split("otn:")[1].split("lotn:")[1].split("originating docket no:")[0].strip()
            extracted_info["originating_docket_nr"] = line.split("otn:")[1].split("lotn:")[1].split("originating docket no:")[1].strip()
            i += 1
        elif("initial issuing authority:" in line or "final issuing authority:" in line):
            

        # Junk line. Keep moving.
        else:
            i += 1
                

    return extracted_info

SyntaxError: invalid syntax (1620509647.py, line 31)

In [208]:
def extract_all(pdf_path: str) -> dict[str, str | dict]:
    # Join together all pages and lines into one string.
    text = extract_text_from_pdf(pdf_path)

    # Partition the text by sections.
    sections = extract_sections(text)

    # Extract defendant information.
    defendant_info = (
        extract_defendant_information(sections.get("DEFENDANT INFORMATION", ""))
        if "DEFENDANT INFORMATION" in sections
        else None
    )

    return(
        {
            "defendant_info": defendant_info
        }
    )

In [None]:
# Run the full extraction on five sample dockets, in this order, and bind each
# result to its own name.
a, a6, a10, a11, a13 = (
    extract_all(sample_pdf)
    for sample_pdf in (
        "../output/pdf_sample/pdfs/ds_Allegheny_CP_02_CR_0000033_2019.pdf",
        "../output/pdf_sample/pdfs/ds_Allegheny_CP_02_CR_0015558_2006.pdf",
        "../output/pdf_sample/pdfs/ds_Montgomery_CP_46_CR_0000333_2019.pdf",
        "../output/pdf_sample/pdfs/ds_Montgomery_CP_46_CR_0001933_2010.pdf",
        "../output/pdf_sample/pdfs/ds_Allegheny_CP_02_CR_0000789_2023.pdf",
    )
)

In [210]:
# Display the extraction result bound to `a` (the first sample docket parsed above).
a

{'defendant_info': {'dob': '11/21/1998', 'address': 'duquesne, pa  15110'}}

In [211]:
# NOTE(review): `a2` is not assigned in any cell visible in this notebook (the
# sample cell assigns a, a6, a10, a11 and a13), so this would raise NameError on
# a fresh kernel unless it is defined somewhere not shown here. The output below
# presumably comes from stale kernel state -- confirm and re-run.
a2

{'defendant_info': {'name': 'martin, gregg jourdan',
  'sex': 'male',
  'dob': '09/20/1990',
  'race': 'black',
  'address_type': ['Home'],
  'address': ['Pittsburgh, PA 15217'],
  'counsel': 'no',
  'defender_requested': 'no',
  'application_provided': 'no',
  'fingerprinted': 'no'}}

In [212]:
# NOTE(review): `a3` is not assigned in any cell visible in this notebook (the
# sample cell assigns a, a6, a10, a11 and a13), so this would raise NameError on
# a fresh kernel unless it is defined somewhere not shown here. The output below
# presumably comes from stale kernel state -- confirm and re-run.
a3

{'defendant_info': {'name': 'nebelski, sandra',
  'sex': 'female',
  'dob': '10/11/1956',
  'race': 'white',
  'address_type': ['Home', 'Home'],
  'address': ['Altoona, PA 16602', 'Altoona, PA 16602'],
  'counsel': 'yes',
  'defender_requested': 'yes',
  'application_provided': 'yes',
  'fingerprinted': 'no'}}

In [213]:
# NOTE(review): `a4` is not assigned in any cell visible in this notebook (the
# sample cell assigns a, a6, a10, a11 and a13), so this would raise NameError on
# a fresh kernel unless it is defined somewhere not shown here. The output below
# presumably comes from stale kernel state -- confirm and re-run.
a4

{'defendant_info': {'name': 'lowery, wayne a.',
  'sex': '',
  'dob': '07/01/1964',
  'race': '',
  'address_type': [],
  'address': [],
  'counsel': 'no',
  'defender_requested': 'yes',
  'application_provided': 'yes',
  'fingerprinted': 'no'}}

In [214]:
# NOTE(review): `a5` is not assigned in any cell visible in this notebook (the
# sample cell assigns a, a6, a10, a11 and a13), so this would raise NameError on
# a fresh kernel unless it is defined somewhere not shown here. The output below
# presumably comes from stale kernel state -- confirm and re-run.
a5

{'defendant_info': {'name': 'murphy, yancey',
  'sex': 'male',
  'dob': '04/01/1970',
  'race': 'black',
  'address_type': ['Home'],
  'address': [],
  'counsel': 'yes',
  'defender_requested': 'no',
  'application_provided': 'no',
  'fingerprinted': 'yes'}}