In [1]:
import re
import os
import json
import pdfplumber

In [2]:
# Lower-cased county labels; the parsers compare whole (lower-cased, stripped)
# lines against this list to recognise county header lines.
# NOTE(review): "northumberla" looks truncated - presumably it mirrors how the
# PDF prints the name; confirm before "fixing" it.
counties = """
    adams allegheny armstrong beaver bedford berks
    blair bradford bucks butler cambria cameron
    carbon centre chester clarion clearfield clinton
    columbia crawford cumberland dauphin delaware elk
    erie fayette forest franklin fulton greene
    huntingdon indiana jefferson juniata lackawanna
    lancaster lawrence lebanon lehigh luzerne lycoming
    mckean mercer mifflin monroe montgomery montour
    northampton northumberla perry philadelphia pike
    potter schuylkill snyder somerset sullivan
    susquehanna tioga union venango warren washington
    wayne westmoreland wyoming york
""".split()

In [92]:
# Sample court summaries used while developing the parser; exactly one
# `pdf_path` assignment should be active at a time.
#pdf_path = "../output/pdf_sample/pdfs/cs_Allegheny_MJ_05003_CR_0002676_2014.pdf"
#pdf_path = "../output/pdf_sample/pdfs/cs_Allegheny_MJ_05003_CR_0000302_2014.pdf"
pdf_path = "../output/pdf_sample/pdfs/cs_Blair_MJ_24103_CR_0000555_2005.pdf"
#pdf_path = "../output/pdf_sample/pdfs/cs_Dauphin_MJ_12201_CR_0000565_2023.pdf"
#pdf_path = "../output/pdf_sample/pdfs/cs_Dauphin_MJ_12205_CR_0000378_2023.pdf"
#pdf_path = "../output/pdf_sample/pdfs/cs_Blair_MJ_24102_CR_0000085_2019.pdf"

# Open the PDF in a context manager so the underlying file handle is released
# as soon as the text has been extracted (the original left it open).
with pdfplumber.open(pdf_path) as pdf:
    pages = pdf.pages
    # One list of text lines per page. The layout/density settings are what
    # the fixed-width column slices used by the parsers further down rely on.
    page_list = [
        page.extract_text(layout=True, x_density=3.9, y_density=13).split("\n")
        for page in pages
    ]

# Concatenate all pages into one page, so to speak.
lines = [line for page in page_list for line in page]

In [25]:
poi_dict = {}

# The beginning of every court summary has a block of text with person information.
# It runs from the first line carrying "DOB:" up to (not including) the first
# "court:" line that follows it.
poi_start_index = next((i for i, x in enumerate(lines) if "DOB:" in x), None)
if poi_start_index is None:
    raise ValueError("No 'DOB:' line found; cannot locate the person information block.")

# Only look *after* the DOB line so an earlier "court:" mention cannot produce
# an empty (or negative-length) block.
poi_end_index = next(
    (i for i in range(poi_start_index + 1, len(lines)) if "court:" in lines[i].lower()),
    None,
)
if poi_end_index is None:
    raise ValueError("No 'court:' line found after the 'DOB:' line; cannot delimit the person information block.")

poi = lines[poi_start_index:poi_end_index]

# Name, DOB, and Sex appear on the first line.
name_text, _, after_dob = poi[0].partition("DOB:")
dob_text, _, sex_text = after_dob.partition("Sex:")
poi_dict["name"] = name_text.strip()
poi_dict["dob"] = dob_text.strip()
poi_dict["sex"] = sex_text.strip()

# Location and Eye Color appear on the second line.
location_text, _, eyes_text = poi[1].partition("Eyes:")
poi_dict["home_location"] = location_text.strip().lower()
poi_dict["eyes"] = eyes_text.strip()

# Hair color is on the third line.
poi_dict["hair"] = poi[2].partition("Hair:")[2].strip()

# Race is on the fourth line.
poi_dict["race"] = poi[3].partition("Race:")[2].strip()

# On the fifth line, if the person has an alias, their aliases will be listed here.
# If they do not have any aliases, the PDF immediately starts the criminal history.
# The label is checked explicitly because a fifth line that is not an alias line
# (e.g. a blank spacer) would otherwise raise an IndexError.
poi_dict["alias"] = ""
if len(poi) > 4 and "Aliases:" in poi[4]:
    poi_dict["alias"] = poi[4].split("Aliases:")[1].strip()

In [None]:
def extract_punishment(p_idx, p_lines):
    """Collect the fixed-width punishment rows that follow a "program type" header.

    Reading starts at ``p_idx`` and stops at the first line that opens a new
    case or section: a line containing "processing status:", "court:" or
    "county:", or a line that is exactly "statewide", a county name, or one of
    "active" / "inactive" / "closed".

    Parameters
    ----------
    p_idx : int
        Index in ``p_lines`` to start from. ``extract_cases`` passes the index
        of the "program type" header line itself.
    p_lines : list of str
        All text lines of the document.

    Returns
    -------
    tuple of (dict, int)
        ``{"punishment_nr_<n>": {column: value, ...}}`` for every data row
        found, and the index of the line that ended the scan
        (``len(p_lines)`` when the end of the document was reached).
    """
    case_statuses = ("active", "inactive", "closed")

    # (key, start, stop) of each fixed-width column.
    # NOTE(review): only "program_type" is grounded in the header text the
    # caller looks for; the other three names are assumed - confirm them
    # against the PDF's header row.
    columns = (
        ("program_type", 0, 40),
        ("sentence_date", 40, 73),
        ("sentence_length", 73, 126),
        ("program_period", 126, None),
    )

    punishment_dict = {}

    while p_idx < len(p_lines):
        cur_p_line = p_lines[p_idx].lower().strip()

        # A new case / court / county / status line ends the punishment rows.
        if (
            "processing status:" in cur_p_line
            or "court:" in cur_p_line
            or "county:" in cur_p_line
            or cur_p_line == "statewide"
            or cur_p_line in counties
            or cur_p_line in case_statuses
        ):
            break

        # The header row and blank lines carry no punishment data.
        if cur_p_line and "program type" not in cur_p_line:
            punishment_key = "punishment_nr_" + str(len(punishment_dict))
            # Each column gets its own key; previously all four slices were
            # written to the same entry, so only the last one survived.
            punishment_dict[punishment_key] = {
                column: cur_p_line[start:stop].strip()
                for column, start, stop in columns
            }

        # Move on to the next line.
        p_idx += 1

    return punishment_dict, p_idx

In [None]:
def extract_cases(c_idx, c_lines):
    """Parse cases from a court summary, starting at line ``c_idx``.

    Every line is lower-cased and stripped, then classified:

    * a line containing "court:" / "county:", or equal to a county name, sets
      the court/county applied to the cases that follow;
    * "closed" / "inactive" / "active" sets the case status for the cases
      that follow;
    * "statewide" switches the statewide flag on for all remaining cases;
    * a line containing "processing status:" opens a new case
      (docket number, processing status, OTN);
    * "arrest date:", "last action:", "next action:" and "bail type:" lines
      add detail fields to the current case;
    * a line containing "§" is a charge, sliced as fixed-width columns;
    * a "program type" line hands over to ``extract_punishment``;
    * anything else is skipped.

    Parameters
    ----------
    c_idx : int
        Index of the first line to parse.
    c_lines : list of str
        All text lines of the document.

    Returns
    -------
    tuple of (dict, int)
        ``{"case_<n>": {...}}`` and the index after the last line consumed.
        (The original had no ``return`` statement and always yielded ``None``.)
    """
    case_statuses = ("closed", "inactive", "active")

    statewide_flag = False
    current_court_county = ""
    current_case_status = ""

    case_dict = {}
    current_case = None  # dict of the case being filled; None until the first case header
    charge_nr = -1

    while c_idx < len(c_lines):
        cur_c_line = c_lines[c_idx].lower().strip()
        next_idx = c_idx + 1

        # Set the court/county for this set of cases.
        if "court:" in cur_c_line:
            current_court_county = cur_c_line.split("court:")[0].strip()
        elif "county:" in cur_c_line:
            current_court_county = cur_c_line.split("county:")[0].strip()
        elif cur_c_line in counties:
            current_court_county = cur_c_line

        # Set the case status for this set of cases.
        if cur_c_line in case_statuses:
            current_case_status = cur_c_line

        # Once statewide is turned on it stays on: all subsequent cases are
        # treated as statewide.
        if cur_c_line == "statewide":
            statewide_flag = True

        if "processing status:" in cur_c_line:
            # The line is already lower-cased, so every label has to be matched
            # in lower case, and the docket number is whatever precedes the
            # "processing status:" label.
            docket_text, _, remainder = cur_c_line.partition("processing status:")

            current_case = {
                "county": current_court_county,
                "case_status": current_case_status,
                "statewide": statewide_flag,
                "docket_number": docket_text.strip(),
            }
            case_dict["case_" + str(len(case_dict))] = current_case
            charge_nr = -1

            # "otn/lotn:" must be tested first because it ends in "otn:".
            if "otn/lotn:" in remainder:
                status_text, _, otn_text = remainder.partition("otn/lotn:")
            elif "otn:" in remainder:
                status_text, _, otn_text = remainder.partition("otn:")
            else:
                status_text, otn_text = remainder, ""
            current_case["proc_status"] = status_text.strip()
            current_case["otn_lotn"] = otn_text.strip()
        elif current_case is None:
            # Detail, charge or punishment lines ahead of the first case header
            # have no case to attach to; skip them.
            pass
        elif "arrest date:" in cur_c_line:
            # Fixed-width row: arrest date | location | disposition event date.
            current_case["arrest_date"] = cur_c_line[:42].partition("arrest date:")[2].strip()
            current_case["case_location"] = cur_c_line[42:88].strip()
            current_case["disp_event_date"] = cur_c_line[88:].partition("disp. event date:")[2].strip()
        elif "last action:" in cur_c_line:
            action, action_date = _labelled_values(cur_c_line, "last action:", "last action date:")
            current_case["last_action"] = action
            current_case["last_action_date"] = action_date
        elif "next action:" in cur_c_line:
            action, action_date = _labelled_values(cur_c_line, "next action:", "next action date:")
            current_case["next_action"] = action
            current_case["next_action_date"] = action_date
        elif "bail type:" in cur_c_line:
            bail_type, bail_amount, bail_status = _labelled_values(
                cur_c_line, "bail type:", "bail amount:", "bail status:"
            )
            current_case["bail_type"] = bail_type
            current_case["bail_amount"] = bail_amount
            current_case["bail_status"] = bail_status
        elif "§" in cur_c_line:
            charge_nr += 1
            # Fixed-width charge row; values are stripped like the other
            # fixed-width parsers in this notebook do.
            current_case["charge_nr_" + str(charge_nr)] = {
                "statute": cur_c_line[:31].strip(),
                "grade": cur_c_line[31:42].strip(),
                "description": cur_c_line[42:88].strip(),
                "disposition": cur_c_line[88:131].strip(),
                "counts": cur_c_line[131:].strip(),
            }
        elif "program type" in cur_c_line:
            punishment_dict, resume_idx = extract_punishment(c_idx, c_lines)
            current_case.update(punishment_dict)
            # Always make progress, even if the helper hands back the same index.
            next_idx = max(resume_idx, c_idx + 1)
        # Any other line is a junk line and is simply skipped.

        c_idx = next_idx

    return case_dict, c_idx


def _labelled_values(text, *labels):
    """Return the stripped text that follows each label in ``text``, in order.

    The labels are expected to appear in the given order. A label that is
    missing leaves the preceding value holding the rest of the line and all
    later values empty, instead of raising an IndexError.
    """
    values = []
    remainder = text.partition(labels[0])[2]
    for next_label in labels[1:]:
        value, _, remainder = remainder.partition(next_label)
        values.append(value.strip())
    values.append(remainder.strip())
    return values

In [None]:
def extract_sqncs_and_sntncs(s_idx, s_lines):
    """Read charge sequences and their sentences starting at line ``s_idx``.

    Lines are lower-cased and stripped before inspection. Scanning stops at
    the first line that is exactly a case status ("closed", "inactive",
    "active", "adjudicated") or a county name, or that contains
    "proc status: " - i.e. at the start of a new section or case.

    A line containing "§" opens a new sequence; a line containing "min:",
    "max:" or an ``dd/dd/dddd`` date is stored as a sentence of the most
    recently opened sequence. Everything else is ignored.

    Returns a tuple ``(sequences, index)`` where ``sequences`` maps
    ``"seq_<n>"`` to the parsed fields (with nested ``"sentence_<m>"`` dicts)
    and ``index`` is the position of the line that ended the scan.
    """
    section_markers = ("closed", "inactive", "active", "adjudicated")

    sequences = {}
    sequence_count = 0
    active_seq_key = "seq_-1"   # placeholder until the first "§" line is seen
    sentences_in_seq = 0
    line_total = len(s_lines)

    while s_idx < line_total:
        text = s_lines[s_idx].lower().strip()

        # A new status block, county or docket header means this case is done.
        starts_new_block = (
            text in section_markers
            or text in counties
            or "proc status: " in text
        )
        if starts_new_block and "continued" not in text:
            break

        if "§" in text:
            # New sequence of charges: sentence numbering restarts with it.
            sentences_in_seq = 0
            active_seq_key = "seq_" + str(sequence_count)
            sequence_count += 1

            # The row is treated as a fixed-width table record.
            sequences[active_seq_key] = {
                "seq_num": text[:11].strip(),
                "statute": text[11:48].strip(),
                "grade": text[48:54].strip(),
                "description": text[54:95].strip(),
                "disposition": text[95:].strip(),
            }
        elif "min:" in text or "max:" in text or re.search(r"\d{2}/\d{2}/\d{4}", text):
            sentence_key = "sentence_" + str(sentences_in_seq)
            sentences_in_seq += 1

            # Sentence rows are fixed-width as well.
            sequences[active_seq_key][sentence_key] = {
                "sentence_date": text[:17].strip(),
                "sentence_type": text[17:43].strip(),
                "program_period": text[43:74].strip(),
                "sentence_length": text[74:].strip(),
            }

        # Advance to the following line.
        s_idx += 1

    return sequences, s_idx

In [208]:
def extract_closed_cases(c_idx, c_lines):
    """Parse the block of closed cases that starts at line ``c_idx``.

    Lines are lower-cased and stripped, then handled as follows:

    * "inactive" / "active" / "adjudicated" ends the closed block;
    * a county name sets the county for the cases that follow;
    * a line containing "proc status: " opens a new case (docket number,
      proc status, DC number, OTN) - unless it directly follows a "continued"
      line and repeats the docket number of the case already being parsed;
    * "arrest dt: " and "def atty:" lines add details to the current case;
    * a line containing "§" hands over to ``extract_sqncs_and_sntncs``;
    * anything else is skipped.

    Parameters
    ----------
    c_idx : int
        Index of the first line after the "closed" header.
    c_lines : list of str
        All text lines of the document.

    Returns
    -------
    tuple of (dict, int)
        ``{"case_<n>": {...}}`` and the index of the line that ended the block
        (``len(c_lines)`` or more when the document ended first).
    """
    closing_statuses = ("inactive", "active", "adjudicated")

    closed_dict = {}
    case_nr = -1
    case_nr_idx = "case_" + str(case_nr)
    # Start from an empty county so a docket line that shows up before any
    # county header is still recorded instead of raising UnboundLocalError.
    county = ""

    while c_idx < len(c_lines):
        cur_c_line = c_lines[c_idx].lower().strip()

        # If the current line is another case status, we have finished all closed cases.
        if cur_c_line in closing_statuses:
            break
        # Check if the current line is a new county.
        elif cur_c_line in counties:
            county = cur_c_line
            c_idx += 1
        # 1st line of a case: Docket Number, Proc. Status, DC Number, and OTN Number.
        elif "proc status: " in cur_c_line:
            cur_line_docket_number = cur_c_line.split("proc status:")[0].strip()

            # Index 0 has no previous line; c_lines[-1] would silently look at
            # the *last* line of the document instead.
            previous_line = c_lines[c_idx - 1].lower().strip() if c_idx > 0 else ""

            # After a page break ("continued") the docket header is repeated.
            # If it matches the case we are already collecting, skip it.
            if (
                "continued" in previous_line
                and case_nr != -1
                and cur_line_docket_number == closed_dict[case_nr_idx]["docket_number"]
            ):
                c_idx += 1
                continue

            case_nr += 1
            case_nr_idx = "case_" + str(case_nr)
            after_status = cur_c_line.split("proc status:")[1]
            after_dc = after_status.split("dc no:")[1]
            closed_dict[case_nr_idx] = {
                "county": county,
                "docket_number": cur_line_docket_number,
                "proc_status": after_status.split("dc no:")[0].strip(),
                "dc_nr": after_dc.split("otn:")[0].strip(),
                "otn_nr": after_dc.split("otn:")[1].strip(),
            }
            c_idx += 1
        elif case_nr == -1:
            # Detail lines ahead of the first docket header have no case to
            # attach to (this used to raise KeyError on "case_-1"); skip them.
            c_idx += 1
        # 2nd line of a case: Arrest date, disposition date, and disposition judge.
        elif "arrest dt: " in cur_c_line:
            after_arrest = cur_c_line.split("arrest dt:")[1]
            after_disp = after_arrest.split("disp date:")[1]
            closed_dict[case_nr_idx]["arrest_date"] = after_arrest.split("disp date:")[0].strip()
            closed_dict[case_nr_idx]["disp_date"] = after_disp.split("disp judge:")[0].strip()
            closed_dict[case_nr_idx]["disp_judge"] = after_disp.split("disp judge:")[1].strip()
            c_idx += 1
        # 3rd line of a case: Defense attorney.
        elif "def atty:" in cur_c_line:
            closed_dict[case_nr_idx]["def_attorney"] = cur_c_line.split("def atty:")[1].strip()
            c_idx += 1
        # When we encounter §, it marks the beginning of a new sequence.
        elif "§" in cur_c_line:
            sequence_dict, c_idx = extract_sqncs_and_sntncs(c_idx, c_lines)
            closed_dict[case_nr_idx].update(sequence_dict)
        # If the line matches none of the above, it's a junk line, and we can skip it.
        else:
            c_idx += 1

    return closed_dict, c_idx

In [209]:
def extract_inactive_active_cases(ia_idx, ia_lines):
    """Parse a block of active / inactive / adjudicated cases starting at ``ia_idx``.

    Lines are lower-cased and stripped, then handled as follows:

    * any case status line ("closed", "inactive", "active", "adjudicated")
      ends the block;
    * a county name sets the county for the cases that follow;
    * a line containing "proc status: " opens a new case (docket number,
      proc status, DC number, OTN) - unless it directly follows a "continued"
      line and repeats the docket number of the case already being parsed;
    * "arrest dt: ", "last action: ", "next action: ", "def atty: " and
      "disp date:" lines add details to the current case;
    * a line containing "§" hands over to ``extract_sqncs_and_sntncs``;
    * anything else is skipped.

    Parameters
    ----------
    ia_idx : int
        Index of the first line after the status header.
    ia_lines : list of str
        All text lines of the document.

    Returns
    -------
    tuple of (dict, int)
        ``{"case_<n>": {...}}`` and the index of the line that ended the block
        (``len(ia_lines)`` or more when the document ended first).
    """
    all_statuses = ("closed", "inactive", "active", "adjudicated")

    ia_dict = {}
    case_nr = -1
    case_nr_idx = "case_" + str(case_nr)
    # Start from an empty county so a docket line that shows up before any
    # county header is still recorded instead of raising UnboundLocalError.
    county = ""

    while ia_idx < len(ia_lines):
        cur_ia_line = ia_lines[ia_idx].lower().strip()

        # If the current line is a case status, this block is finished.
        if cur_ia_line in all_statuses:
            break
        # Check if the current line is a new county.
        elif cur_ia_line in counties:
            county = cur_ia_line
            ia_idx += 1
        # 1st line of a case: Docket Number, Proc. Status, DC Number, and OTN Number.
        elif "proc status: " in cur_ia_line:
            cur_line_docket_number = cur_ia_line.split("proc status:")[0].strip()

            # Index 0 has no previous line; ia_lines[-1] would silently look
            # at the *last* line of the document instead.
            previous_line = ia_lines[ia_idx - 1].lower().strip() if ia_idx > 0 else ""

            # After a page break ("continued") the docket header is repeated.
            # If it matches the case we are already collecting, skip it.
            if (
                "continued" in previous_line
                and case_nr != -1
                and cur_line_docket_number == ia_dict[case_nr_idx]["docket_number"]
            ):
                ia_idx += 1
                continue

            case_nr += 1
            case_nr_idx = "case_" + str(case_nr)
            after_status = cur_ia_line.split("proc status:")[1]
            after_dc = after_status.split("dc no:")[1]
            ia_dict[case_nr_idx] = {
                "county": county,
                "docket_number": cur_line_docket_number,
                "proc_status": after_status.split("dc no:")[0].strip(),
                "dc_nr": after_dc.split("otn:")[0].strip(),
                "otn_nr": after_dc.split("otn:")[1].strip(),
            }
            ia_idx += 1
        elif case_nr == -1:
            # Detail lines ahead of the first docket header have no case to
            # attach to (this used to raise KeyError on "case_-1"); skip them.
            ia_idx += 1
        # Arrest date, trial date, legacy number.
        elif "arrest dt: " in cur_ia_line:
            after_trial = cur_ia_line.split("trial dt:")[1]
            ia_dict[case_nr_idx]["arrest_date"] = cur_ia_line.split("arrest dt:")[1].split("trial dt:")[0].strip()
            ia_dict[case_nr_idx]["trial_date"] = after_trial.split("legacy no:")[0].strip()
            ia_dict[case_nr_idx]["legacy_number"] = after_trial.split("legacy no:")[1].strip()
            ia_idx += 1
        # Last action, last action date, last action room.
        elif "last action: " in cur_ia_line:
            after_action = cur_ia_line.split("last action:")[1]
            after_date = after_action.split("last action date:")[1]
            ia_dict[case_nr_idx]["last_action"] = after_action.split("last action date:")[0].strip()
            ia_dict[case_nr_idx]["last_action_date"] = after_date.split("last action room:")[0].strip()
            ia_dict[case_nr_idx]["last_action_room"] = after_date.split("last action room:")[1].strip()
            ia_idx += 1
        # Next action, next action date, next action room.
        elif "next action: " in cur_ia_line:
            after_action = cur_ia_line.split("next action:")[1]
            after_date = after_action.split("next action date:")[1]
            ia_dict[case_nr_idx]["next_action"] = after_action.split("next action date:")[0].strip()
            ia_dict[case_nr_idx]["next_action_date"] = after_date.split("next action room:")[0].strip()
            ia_dict[case_nr_idx]["next_action_room"] = after_date.split("next action room:")[1].strip()
            ia_idx += 1
        # Occasionally, the defense attorney is listed as well.
        elif "def atty: " in cur_ia_line:
            ia_dict[case_nr_idx]["def_attorney"] = cur_ia_line.split("def atty:")[1].strip()
            ia_idx += 1
        # When we encounter §, it marks the beginning of a new sequence.
        elif "§" in cur_ia_line:
            sequence_dict, ia_idx = extract_sqncs_and_sntncs(ia_idx, ia_lines)
            ia_dict[case_nr_idx].update(sequence_dict)
        # Occasionally a disposition date and disposition judge are listed.
        elif "disp date:" in cur_ia_line:
            after_disp = cur_ia_line.split("disp date:")[1]
            ia_dict[case_nr_idx]["disp_date"] = after_disp.split("disp judge:")[0].strip()
            ia_dict[case_nr_idx]["disp_judge"] = after_disp.split("disp judge:")[1].strip()
            ia_idx += 1
        # If the line matches none of the above, it's a junk line, and we can skip it.
        else:
            ia_idx += 1

    return ia_dict, ia_idx

In [216]:
current_line_index = poi_end_index
cs_dict = {}

# Which parser handles the block that follows each case-status header line.
case_status_parsers = {
    "closed": extract_closed_cases,
    "inactive": extract_inactive_active_cases,
    "active": extract_inactive_active_cases,
    "adjudicated": extract_inactive_active_cases,
}

# Loop through the rest of the lines and capture information about an individual's criminal history.
while current_line_index < len(lines):
    cur_line = lines[current_line_index].lower().strip()

    # A header is a line that is *exactly* a case status - the same test the
    # parsers use to detect the end of their block. Substring matching would
    # also fire on ordinary lines that merely contain e.g. "active".
    parse_cases = case_status_parsers.get(cur_line)
    if parse_cases is None:
        # Not a header: step to the next line. (Previously the index was
        # replaced by "" here, which made the loop condition raise TypeError.)
        current_line_index += 1
        continue

    case_status = cur_line

    # Start parsing on the line following the case status line.
    parsed_cases, new_line_index = parse_cases(current_line_index + 1, lines)

    # The same status can show up more than once; append to what was already
    # collected (renumbering the keys) instead of overwriting it.
    status_cases = cs_dict.setdefault(case_status, {})
    for parsed_case in parsed_cases.values():
        status_cases["case_" + str(len(status_cases))] = parsed_case

    # The parser returns the index of the line that ended its block; make sure
    # the loop always moves forward.
    current_line_index = max(new_line_index, current_line_index + 1)

poi_dict.update(cs_dict)

In [None]:
# Show the assembled dictionary (person fields plus the per-status case
# dictionaries merged in by the previous cell) as this cell's output.
poi_dict

In [None]:
# Persist the parsed summary as pretty-printed JSON (4-space indent).
filename = "test.json"

with open(filename, mode="w") as json_file:
    json_file.write(json.dumps(poi_dict, indent=4))