In [None]:
import re
import json
import pandas as pd
import pdfplumber

In [None]:
# Extract text from PDF.
# Args:
#   pdf_path (str): File path to the PDF.
# Returns:
#   string: A single string which is the concatenated version of all pages and lines in the PDF.
def extract_text_from_pdf(pdf_path: str) -> str:
    """Read every page of a PDF and return the text as a single string.

    Args:
        pdf_path (str): File path to the PDF.

    Returns:
        str: The text of all pages, in page order, joined with newline
        characters.
    """
    # Open the PDF in a context manager so the underlying file handle is
    # released once the text has been read; the bare open() call left it open.
    with pdfplumber.open(pdf_path) as pdf:
        # layout=True keeps the horizontal position of the text, which the
        # section parsers in this notebook slice at fixed character offsets.
        # `or ""` guards the join against a page that yields no text.
        # NOTE(review): whether extract_text can return None depends on the
        # installed pdfplumber version - confirm.
        page_texts = [
            page.extract_text(keep_blank_chars=True, layout=True, x_density=3.9, y_density=13) or ""
            for page in pdf.pages
        ]

    return "\n".join(page_texts)

In [None]:
# Extracts sections from the document text.
# Args:
#   text (str): The text of the document.
# Returns:
#   dict: A dictionary containing the extracted sections with the section headers as keys.
def extract_sections(text: str) -> dict[str, str]:
    """Split the docket text into sections keyed by their section header.

    A header is a run of 4 or more upper-case letters, whitespace, slashes
    and/or hyphens that fills a whole line. Text that belongs to the same
    (normalised) header more than once is concatenated, each piece prefixed
    with a newline.

    Args:
        text (str): The text of the document.

    Returns:
        dict: Section text keyed by section header.
    """
    header_regex = re.compile(r"^\s*[A-Z\s\/\-]{4,}\s*$", re.MULTILINE)

    # Collect (start offset, header text) pairs. Candidates of 3 or fewer
    # characters are dropped because all-capital acronyms such as OAG and PSP
    # sit alone on a line in the CASE FINANCIAL INFORMATION section, and RRRI
    # is dropped because it can show up as a punishment condition.
    candidates = []
    for found in header_regex.finditer(text):
        title = found.group().strip()
        if len(title) > 3 and title != "RRRI":
            candidates.append((found.start(), title))

    sections = {}
    total = len(candidates)

    for position, (begin, title) in enumerate(candidates):
        # A section runs up to the next header, or to the end of the text.
        if position + 1 < total:
            finish = candidates[position + 1][0]
        else:
            finish = len(text)

        # Cut out the section and peel the header off the front of it.
        body = text[begin:finish].strip()
        body = body[len(title):].strip()

        # Section headers do not always carry over to a new page. File the
        # text under a CRIMINAL DOCKET page header with the header two
        # positions back, after dropping the first five non-blank lines
        # (page-header junk).
        # NOTE(review): the guard only fires from the fourth header onwards
        # (position > 2); confirm that `>= 0` was not intended.
        if "CRIMINAL DOCKET" in title and position - 2 > 0:
            title = candidates[position - 2][1]
            kept_lines = [row for row in body.split("\n") if row.strip() != ""]
            body = "\n".join(kept_lines[5:])

        # Collapse the different spellings of the same header into one key.
        if "ATTORNEY INFORMATION" in title:
            title = "ATTORNEY INFORMATION"
        elif "BAIL INFORMATION" in title:
            title = "BAIL"

        # Append to whatever has already been stored under this header.
        sections[title] = sections.get(title, "") + f"\n{body}"

    return sections

In [None]:
# Extracts the defendant's information from the DEFENDANT INFORMATION section.
# Args:
#   text (str): The text containing the defendant's information.
# Return:
#   dict: A dictionary containing the extracted information.

def extract_defendant_information(text: str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    i = 0

    # Defendant information follows a straightforward pattern.
    # In MJ dockets, the following pattern holds:
    #   Line 1 is name and sex.
    #   Line 2 is DOB and race.
    #   Line 4 is type of address for each address (e.g., Home, Mailing, Other)
    #   Line 5 is the addresses.
    #   Line 6 is if the defendant has been advised of their right to apply for assignment of counsel.
    #   Line 7 is if the defendant requested a public defender.
    #   Line 8 is if an application has been provided for the appointment of a public defender.
    #   Line 9 is if the defendant has been finger printed.
    while(i < len(split)):
        line = split[i].lower().strip()
        if("name:" in line or "sex:" in line):
            extracted_info["name"] = line.split("name:")[1].split("sex:")[0].strip()
            extracted_info["sex"] = line.split("name:")[1].split("sex:")[1].strip()
            i += 1
        elif("date of birth:" and "race:" in line):
            extracted_info["dob"] = line.split("date of birth:")[1].split("race:")[0].strip()
            extracted_info["race"] = line.split("date of birth:")[1].split("race:")[1].strip()
            i += 1
        elif("address(es):" in line):
            extracted_info["address_type"] = split[i + 1].split()
            extracted_info["address"] = re.split("\s{2,}", split[i + 2].strip())
            #extracted_info["address"] = re.findall(r"([A-Za-z]+\s*,\s*[A-Za-z]{2}\s*[0-9]{5})", split[i + 2])
            i += 3
        elif("advised of his right to apply for assignment of counsel?" in line):
            extracted_info["counsel"] = line.split("advised of his right to apply for assignment of counsel?")[1].strip()
            i += 1
        elif("public defender requested by the defendant?" in line):
            extracted_info["defender_requested"] = line.split("public defender requested by the defendant?")[1].strip()
            i += 1
        elif("application provided for appointment of public defender?" in line):
            extracted_info["application_provided"] = line.split("application provided for appointment of public defender?")[1].strip()
            i += 1
        elif("has the defendant been fingerprinted?" in line):
            extracted_info["fingerprinted"] = line.split("has the defendant been fingerprinted?")[1].strip()
            i += 1
        # Line is a junk line. Keep moving on.
        else:
            i += 1
        
    return extracted_info

In [None]:
# Extracts the case information from the CASE INFORMATION section.
# Args:
#   text(str): The text containing the case information.
# Return:
#   dict: A dictionary containing the extracted information.
def extract_case_information(text: str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    i = 0

    # Case information follows a straightforward pattern.
    # In MJ dockets:
    #   Line 1 is judge assigned (optional and potentially multiple lines) issue date.
    #   Line 2 is OTN (or OTN/LOTN) and file date.
    #   Line 3 is arresting agency and arrest date.
    #   Line 4 is complaint number (or document number) and incident number.
    #   Line 5 is disposition and disposition date.
    #   Line 6 is county and township.
    #   Line 7 is case status.
    while(i < len(split)):
        line = split[i].lower().strip()
        
        if("issue date:" in line):
            extracted_info["issue_date"] = line.split("issue date:")[1].strip()
            j = 1

            if("judge assigned" in line):
                extracted_info["judge_assigned"] = line.split("judge assigned:")[1].split("issue date:")[0].strip()
                
                # Check the next line if the judge's name takes up multiple lines.
                while("otn" not in split[i + j].lower() and "file date" not in split[i + j].lower() and "otn/lotn" not in split[i + j]):
                    lookahead_line = split[i + j].lower().strip()
                    extracted_info["judge_assigned"] = extracted_info["judge_assigned"] + " " + lookahead_line
                    j += 1

            i += 1 + (j - 1)
        elif("file date:" in line):
            if(re.search("^otn:", line)):
                extracted_info["otn"] = line.split("otn:")[1].split("file date:")[0].strip()
                extracted_info["file_date"] = line.split("otn:")[1].split("file date:")[1].strip()
            elif("otn/lotn:" in line):
                extracted_info["otn_lotn"] = line.split("otn/lotn:")[1].split("file date:")[0].strip()
                extracted_info["file_date"] = line.split("otn/lotn:")[1].split("file date:")[1].strip()
            i += 1
        elif("arresting agency:" in line or "arrest date:" in line):
            extracted_info["arresting_agency"] = line.split("arresting agency:")[1].split("arrest date:")[0].strip()
            extracted_info["arrest_date"] = line.split("arresting agency:")[1].split("arrest date:")[1].strip()
            i += 1
        elif("complaint no.:" in line or "incident no.:" in line):
            if("complaint no.:" in line):
                extracted_info["complaint_nr"] = line.split("complaint no.:")[1].split("incident no.:")[0].strip()
                extracted_info["incident_nr"] = line.split("complaint no.:")[1].split("incident no.:")[1].strip()
            elif("document no.:" in line):
                extracted_info["document_nr"] = line.split("document no.:")[1].split("incident no.:")[0].strip()
                extracted_info["incident_nr"] = line.split("document no.:")[1].split("incident no.:")[1].strip()
            i += 1
        elif("disposition:" in line or "disposition date:" in line):
            extracted_info["disposition"] = line.split("disposition:")[1].split("disposition date:")[0].strip()
            extracted_info["disposition_date"] = line.split("disposition:")[1].split("disposition date:")[1].strip()
            i += 1
        elif("county:" in line or "township:" in line):
            extracted_info["county"] = line.split("county:")[1].split("township:")[0].strip()
            extracted_info["township"] = line.split("county:")[1].split("township:")[1].strip()
            i += 1
        elif("case status:" in line):
            extracted_info["case_status"] = line.split("case status:")[1].strip()
            i += 1
        # Junk line. Keep moving.
        else:
            i += 1
                
    return extracted_info

In [None]:
# Extracts status information from the STATUS INFORMATION section.
# Args:
#   text(str): The text containing the status information.
# Return:
#   dict: A dictionary containing the status information.
def extract_status_information(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    
    # Line counter.
    i = 0
    status_nr = -1
    status_idx = "status_nr_" + str(status_nr)

    while(i < len(split)):
        line = split[i].lower()
        
        # If it's not a header row, the end of the document, or an empty line, then it is a status information row.
        if("case status" not in line and "status date" not in line and "processing status" not in line and "printed:" not in line and "recent entries made" not in line and "administrative office of penn" not in line and "docket sheet information should" not in line and "comply with the provi" not in line and "set forth in 18" not in line and "district judge" not in line and line.strip() != ""):
            status_nr += 1
            status_idx = "status_nr_" + str(status_nr)
            extracted_info[status_idx] = {}
            extracted_info[status_idx]["case_status"] = line[:35].strip()
            extracted_info[status_idx]["status_date"] = line[35:56].strip()
            extracted_info[status_idx]["processing_status"] = line[56:].strip()
        
        i += 1
    
    return(extracted_info)

In [None]:
# Extracts calendar events from the CALENDAR EVENTS section.
# Args:
#   text(str): The text containing the calendar events information.
# Return:
#   dict: A dictionary containing the calendar events information.
def extract_calendar_events(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    
    # Line counter.
    i = 0
    event_nr = -1
    event_idx = "event_nr_" + str(event_nr)

    while(i < len(split)):
        line = split[i].lower()
        
        # If we find a date on the line (that is not at the end of the document), then it is a calendar event.
        if(re.search(r"\d{2}/\d{2}/\d{4}", line) and "printed:" not in line):
            event_nr += 1
            event_idx = "event_nr_" + str(event_nr)
            extracted_info[event_idx] = {}

            extracted_info[event_idx]["event_type"] = line[:37].strip()
            extracted_info[event_idx]["start_date"] = line[37:49].strip()
            extracted_info[event_idx]["start_time"] = line[49:65].strip()
            extracted_info[event_idx]["room"] = line[65:90].strip()
            extracted_info[event_idx]["judge"] = line[90:127].strip()
            extracted_info[event_idx]["schedule_status"] = line[127:].strip()

        # If we do not find a date, but it is not the end of the document nor is it the header row or an empty row, then it is an overflow row.
        elif("case calendar" not in line and "event type" not in line and "printed:" not in line and "recent entries" not in line and "administrative" not in line and "docket sheet" not in line and "comply" not in line and "set forth" not in line and line.strip() != ""):
            extracted_info[event_idx]["event_type"] = extracted_info[event_idx]["event_type"] + " " + line[:37].strip()
            extracted_info[event_idx]["start_date"] = extracted_info[event_idx]["start_date"] + " " + line[37:49].strip()
            extracted_info[event_idx]["start_time"] = extracted_info[event_idx]["start_time"] + " " + line[49:65].strip()
            extracted_info[event_idx]["room"] = extracted_info[event_idx]["room"] + " " + line[65:90].strip()
            extracted_info[event_idx]["judge"] = extracted_info[event_idx]["judge"] + " " + line[90:127].strip()
            extracted_info[event_idx]["schedule_status"] = extracted_info[event_idx]["schedule_status"] + " " + line[127:].strip()
        
        i += 1
    
    return(extracted_info)

In [None]:
# Extracts case participants from the CASE PARTICIPANTS section.
# Args:
#   text(str): The text containing the case participants.
# Return:
#   dict: A dictionary containing the case participant information.
def extract_case_participants(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    
    # Line counter.
    i = 0
    participant_nr = -1
    participant_idx = "participant_nr_" + str(participant_nr)

    while(i < len(split)):
        line = split[i].lower().strip()
        
        # If it is not the end of the document nor is it the header row or an empty row, then it is a case participant.
        if("participant type" not in line and "printed:" not in line and "recent entries" not in line and "administrative" not in line and "docket sheet" not in line and "comply with the" not in line and "set forth in" not in line and "magisterial district judge" not in line and line != ""):
            participant_nr += 1
            participant_idx = "participant_nr_" + str(participant_nr)
            extracted_info[participant_idx] = {}

            extracted_info[participant_idx]["participant_type"] = line[:45].strip()
            extracted_info[participant_idx]["name"] = line[45:].strip()
        
        i += 1
    
    return(extracted_info)

In [None]:
# Extracts the charges from the CHARGES section.
# Args:
#   text(str): The text containing the charges information.
# Return:
#   dict: A dictionary containing the extracted information.
def extract_charges(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    charge_nr = -1
    charge_nr_idx = "charge_nr_" + str(charge_nr)
    i = 0

    while(i < len(split)):
        line = split[i].lower()

        # Skip blank lines, column header line, and junk lines.
        if("offense dt." in line or line.strip() == "" or "reflected on these docket sheets" in line or "inaccurate or delayed data" in line or "docket sheet information should" in line or "not comply with the" in line or "liability as set forth" in line or "printed:" in line or "magisterial district judge" in line):
            i += 1
        # The ยง character indicates a new charge.
        elif("ยง" in line):
            charge_nr += 1
            charge_nr_idx = "charge_nr_" + str(charge_nr)
            extracted_info[charge_nr_idx] = {}

            # Each charge adheres to the following pattern.
            extracted_info[charge_nr_idx]["nr"] = line[:17].strip()
            extracted_info[charge_nr_idx]["charge"] = line[17:45].strip()
            extracted_info[charge_nr_idx]["grade"] = line[45:53].strip()
            extracted_info[charge_nr_idx]["description"] = line[53:108].strip()
            extracted_info[charge_nr_idx]["offense_date"] = line[108:121].strip()
            extracted_info[charge_nr_idx]["disposition"] = line[121:].strip()
            i += 1
        # If the line is not a header line, a blank line, a new charge, or a junk line, then it is the description from the previous charge overflowing onto a new line.
        else:
            extracted_info[charge_nr_idx]["description"] = extracted_info[charge_nr_idx]["description"] + " " + line.strip()
            i += 1
    
    return(extracted_info)

In [None]:
# Extracts the disposition / sentencing details from the DISPOSITION / SENTENCING DETAILS section.
# Args:
#   text(str): The text containing the disposition and sentencing details.
# Return:
#   dict: A dictionary containing the extracted information.
def extract_disp_sent(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    offense_nr = -1
    offense_nr_idx = "offense_nr_" + str(offense_nr)
    i = 0

    while(i < len(split)):
        line = split[i].lower()

        # If we find a date, this line contains the disposition, the disposition date, and if the defendant was present.
        if(re.search(r"\d{2}/\d{2}/\d{4}", line) and "printed:" not in line):
            extracted_info["case_disposition"] = line[:69].strip()
            extracted_info["disposition_date"] = line[69:103].strip()
            extracted_info["defendant_present"] = line[103:].strip()
        # As long as we are not on a column header, a junk line, or the end of page AND there is at least one number, then we are on a new offense.
        elif("case disposition" not in line and "offense disposition" not in line and line.strip() != "" and "reflected on these docket sheets" not in line and "inaccurate or delayed data" not in line and "docket sheet information should" not in line and "not comply with the" not in line and "liability as set forth" not in line and "printed:" not in line and "magisterial district judge" not in line and re.search("[0-9]+", line)):
            offense_nr += 1
            offense_nr_idx = "offense_nr_" + str(offense_nr)
            extracted_info[offense_nr_idx] = {}

            extracted_info[offense_nr_idx]["offense_seq"] = line[:14].strip()
            extracted_info[offense_nr_idx]["description"] = line[14:80].strip()
            extracted_info[offense_nr_idx]["offense_disposition"] = line[80:].strip()
        # However, if there is no number, then it is a continuation of the previous line.
        elif("case disposition" not in line and "offense disposition" not in line and line.strip() != "" and "reflected on these docket sheets" not in line and "inaccurate or delayed data" not in line and "docket sheet information should" not in line and "not comply with the" not in line and "liability as set forth" not in line and "printed:" not in line and "magisterial district judge" not in line and not re.search("[0-9]+", line)):
            extracted_info[offense_nr_idx]["description"] = extracted_info[offense_nr_idx]["description"] + " " + line.strip()

        i += 1
    
    return(extracted_info)

In [46]:
# Extract attorney information from the ATTORNEY INFORMATION section.
# Args:
#   text(str): The text containing the attorney information.
# Return:
#   dict: A dictionary containing the attorney information.
def extract_attorney_info(text:str) -> dict[str, str | list]:
    split = text.split("\n")
    for line in split:
        print(line)
    extracted_info = {}
    i = 0

    # Split each line into two sides.
    lefthand_lines = [line[:69].strip().lower() for line in split]
    #for line in lefthand_lines:
        #print(line)
    righthand_lines = [line[69:].strip().lower() for line in split]
    
    # Indices for the lawyers.
    lawyer_nr = -1
    lawyer_idx = "lawyer_nr_" + str(lawyer_nr)

    # Use this to signal we are in an address block.
    address_block = False
    
    while(i < len(split)):
        l_line = lefthand_lines[i].strip().lower()
        #print(l_line)
        
        if("name:" in l_line):
            # New lawyer.
            lawyer_nr += 1
            lawyer_idx = "lawyer_nr_" + str(lawyer_nr)
            extracted_info[lawyer_idx] = {}

            # Extract name and type.
            extracted_info[lawyer_idx]["name"] = l_line.split("name:")[1].strip()
            extracted_info[lawyer_idx]["type"] = split[i-1].strip()
        elif("representing:" in l_line):
            extracted_info[lawyer_idx]["representing"] = l_line.split("representing:")[1].strip()
        elif("counsel status:" in l_line):
            extracted_info[lawyer_idx]["counsel_status"] = l_line.split("counsel status:")[1].strip()
        elif("supreme court no.:" in l_line):
            extracted_info[lawyer_idx]["supreme_court_nr"] = l_line.split("supreme court no.:")[1].strip()
        elif("phone no.:" in l_line):
            extracted_info[lawyer_idx]["phone_nr"] = l_line.split("phone no.:")[1].strip()
        elif("address:" in l_line):
            extracted_info[lawyer_idx]["address"] = l_line.split("address:")[1].strip()
            address_block = True
        # As long as we are not at the end of the page or on a blank line AND we are in the address block, collect the data and add it to the address.
        elif(l_line != "" and "reflected on these docket sheets" not in l_line and "inaccurate or delayed data" not in l_line and "docket sheet information should" not in l_line and "not comply with the" not in l_line and "liability as set forth" not in l_line and "printed:" not in l_line and "magisterial district judge" not in l_line and address_block):
            extracted_info[lawyer_idx]["address"] = extracted_info[lawyer_idx]["address"] + "|" + l_line

            # If we find a the city + state + zip code, then the address is over.
            if(re.search("[A-Za-z]+,\s*[A-Za-z]{2}\s*[0-9]{5}", l_line)):
                address_block = False

        i += 1

    while(i < len(split)):
        r_line = righthand_lines[i].strip().lower()

        if("name:" in r_line):
            # New lawyer.
            lawyer_nr += 1
            lawyer_idx = "lawyer_nr_" + str(lawyer_nr)
            extracted_info[lawyer_idx] = {}

            # Extract name and type.
            extracted_info[lawyer_idx]["name"] = r_line.split("name:")[1].strip()
            extracted_info[lawyer_idx]["type"] = split[i-1].strip()
        elif("representing:" in r_line):
            extracted_info[lawyer_idx]["representing"] = r_line.split("representing:")[1].strip()
        elif("counsel status:" in r_line):
            extracted_info[lawyer_idx]["counsel_status"] = r_line.split("counsel status:")[1].strip()
        elif("supreme court no.:" in r_line):
            extracted_info[lawyer_idx]["supreme_court_nr"] = r_line.split("supreme court no.:")[1].strip()
        elif("phone no.:" in r_line):
            extracted_info[lawyer_idx]["phone_nr"] = r_line.split("phone no.:")[1].strip()
        elif("address:" in r_line):
            extracted_info[lawyer_idx]["address"] = r_line.split("address:")[1].strip()
            address_block = True
        # As long as we are not at the end of the page or on a blank line AND we are in the address block, collect the data and add it to the address.
        elif(r_line != "" and "reflected on these docket sheets" not in r_line and "inaccurate or delayed data" not in r_line and "docket sheet information should" not in r_line and "not comply with the" not in r_line and "liability as set forth" not in r_line and "printed:" not in r_line and "magisterial district judge" not in r_line and address_block):
            extracted_info[lawyer_idx]["address"] = extracted_info[lawyer_idx]["address"] + "|" + r_line

            # If we find a the city + state + zip code, then the address is over.
            if(re.search("[A-Za-z]+,\s*[A-Za-z]{2}\s*[0-9]{5}", r_line)):
                address_block = False

        i += 1
    
    return(extracted_info)

In [None]:
def extract_all(pdf_path: str) -> dict[str, str | dict]:
    """Parse one docket PDF into a dictionary of its extracted sections.

    Args:
        pdf_path (str): File path to the docket PDF.

    Returns:
        dict: One entry per known section ("defendant_info", "case_info",
        "status_info", "calendar_events", "case_participants", "charges",
        "disp_sent_details", "attorney_info"). The value is the dictionary
        produced by that section's parser, or None when the docket has no
        such section.
    """
    # Join together all pages and lines into one string.
    text = extract_text_from_pdf(pdf_path)

    # Partition the text by sections.
    sections = extract_sections(text)

    # (key in the result, section header in the docket, parser for the section)
    section_parsers = (
        ("defendant_info", "DEFENDANT INFORMATION", extract_defendant_information),
        ("case_info", "CASE INFORMATION", extract_case_information),
        ("status_info", "STATUS INFORMATION", extract_status_information),
        ("calendar_events", "CALENDAR EVENTS", extract_calendar_events),
        ("case_participants", "CASE PARTICIPANTS", extract_case_participants),
        ("charges", "CHARGES", extract_charges),
        ("disp_sent_details", "DISPOSITION / SENTENCING DETAILS", extract_disp_sent),
        ("attorney_info", "ATTORNEY INFORMATION", extract_attorney_info),
    )

    # Every section was already being parsed; all but "attorney_info" were
    # then dropped by commented-out keys in the returned dictionary. They are
    # returned again so the function delivers what its name promises.
    return {
        result_key: parser(sections[header]) if header in sections else None
        for result_key, header, parser in section_parsers
    }

In [39]:
# Run the extraction over each of the 24 sample docket PDFs, keeping every
# result in its own variable (a, a2, ..., a24) so it can be inspected afterwards.
# The paths are relative to the notebook's working directory.
a = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0000302_2014.pdf")
a2 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0002806_2006.pdf")
a3 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0002929_2019.pdf")
a4 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0002952_2014.pdf")
a5 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0005051_2019.pdf")
a6 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0006269_2023.pdf")
a7 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0006629_2010.pdf")
a8 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0006843_2023.pdf")
a9 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0007717_2010.pdf")
a10 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0012491_2010.pdf")
a11 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05203_CR_0000296_2014.pdf")
a12 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05208_CR_0000083_2010.pdf")
a13 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05213_CR_0000549_2019.pdf")
a14 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05222_CR_0000351_2006.pdf")
a15 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24102_CR_0000192_2014.pdf")
a16 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24102_CR_0000446_2023.pdf")
a17 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24102_CR_0000562_2023.pdf")
a18 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24102_CR_0000701_2010.pdf")
a19 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000337_2014.pdf")
a20 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000453_2014.pdf")
a21 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000484_2005.pdf")
a22 = extract_all("../output/pdf_sample/pdfs/ds_Dauphin_MJ_12204_CR_0000734_2005.pdf")
a23 = extract_all("../output/pdf_sample/pdfs/ds_Montgomery_MJ_38110_CR_0000233_2005.pdf")
a24 = extract_all("../output/pdf_sample/pdfs/ds_Montgomery_MJ_38119_CR_0000257_2010.pdf")

KeyError: 'lawyer_nr_-1'

In [None]:
# Re-run the one docket in which the phone number is the first line of the
# ATTORNEY INFORMATION section, ahead of the attorney's name (very rare).
# TODO: add an if-clause to extract_attorney_info for this case.
a18 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24102_CR_0000701_2010.pdf")


Phone No.:  814-693-3010                                                        
        District Attorney                                                                                                                                    
        Name:  Blair County District Attorney's Office                       Address:      Blair Cnty Courthouse                                             
                                                                                           423 Allegheny St Ste 421                                          
        Representing:  Commonwealth of Pennsylvania                                                                                                          
                                                                                           Hollidaysburg, PA  16648                                          
        Counsel Status:  Active                                                                                                 

KeyError: 'lawyer_nr_-1'

In [None]:
#a2
#a9
#a10
#a16
#a18
#a23
#a24