In [None]:
# Extracts the defendant's information from the DEFENDANT INFORMATION section.
# Args:
#   text (str): The text containing the defendant's information.
# Return:
#   dict: A dictionary containing the extracted information.

def extract_defendant_information(text: str) -> dict[str, str | list]:
    split = text.split("\n")
    extracted_info = {}
    i = 0

    # Defendant information follows a straightforward pattern.
    # In MJ dockets, the following pattern holds:
    #   Line 1 is name and sex.
    #   Line 2 is DOB and race.
    #   Line 4 is type of address for each address (e.g., Home, Mailing, Other)
    #   Line 5 is the addresses.
    #   Line 6 is if the defendant has been advised of their right to apply for assignment of counsel.
    #   Line 7 is if the defendant requested a public defender.
    #   Line 8 is if an application has been provided for the appointment of a public defender.
    #   Line 9 is if the defendant has been finger printed.
    while(i < len(split)):
        line = split[i].lower().strip()
        if("name:" in line or "sex:" in line):
            extracted_info["name"] = line.split("name:")[1].split("sex:")[0].strip()
            extracted_info["sex"] = line.split("name:")[1].split("sex:")[1].strip()
            i += 1
        elif("date of birth:" and "race:" in line):
            extracted_info["dob"] = line.split("date of birth:")[1].split("race:")[0].strip()
            extracted_info["race"] = line.split("date of birth:")[1].split("race:")[1].strip()
            i += 1
        elif("address(es):" in line):
            extracted_info["address_type"] = split[i + 1].split()
            extracted_info["address"] = re.findall(r"([A-Za-z]+\s*,\s*[A-Za-z]{2}\s*[0-9]{5})", split[i + 2])
            i += 3
        elif("advised of his right to apply for assignment of counsel?" in line):
            extracted_info["counsel"] = line.split("advised of his right to apply for assignment of counsel?")[1].strip()
            i += 1
        elif("public defender requested by the defendant?" in line):
            extracted_info["defender_requested"] = line.split("public defender requested by the defendant?")[1].strip()
            i += 1
        elif("application provided for appointment of public defender?" in line):
            extracted_info["application_provided"] = line.split("application provided for appointment of public defender?")[1].strip()
            i += 1
        elif("has the defendant been fingerprinted?" in line):
            extracted_info["fingerprinted"] = line.split("has the defendant been fingerprinted?")[1].strip()
            i += 1
        # Line is a junk line. Keep moving on.
        else:
            i += 1
        
    return extracted_info

In [None]:
# Run extract_all (defined elsewhere) on a handful of sample PDFs and keep each
# result in its own variable. The paths are relative, so they resolve against
# the current working directory.
# NOTE(review): the files look like MJ docket sheets from Allegheny, Blair and
# Dauphin counties, judging by their names — confirm.
# NOTE(review): there is no a1 or a6 in this cell — confirm that is intentional.
a2 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0000302_2014.pdf")
a3 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000337_2014.pdf")
a4 = extract_all("../output/pdf_sample/pdfs/ds_Dauphin_MJ_12204_CR_0000734_2005.pdf")
a5 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05003_CR_0002952_2014.pdf")
a7 = extract_all("../output/pdf_sample/pdfs/ds_Allegheny_MJ_05208_CR_0000083_2010.pdf")
a8 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000484_2005.pdf")
a9 = extract_all("../output/pdf_sample/pdfs/ds_Blair_MJ_24103_CR_0000453_2014.pdf")