In [202]:
import re
import os
import json
import pdfplumber

In [203]:
# Lower-case county names used to recognise county header lines in the
# extracted text. Entries are compared against whole, stripped, lower-cased lines.
# NOTE(review): "northumberla" is kept exactly as originally written — presumably
# the name appears truncated in the extracted text; confirm against a sample PDF.
counties = [
    # A
    "adams", "allegheny", "armstrong",
    # B
    "beaver", "bedford", "berks", "blair", "bradford", "bucks", "butler",
    # C
    "cambria", "cameron", "carbon", "centre", "chester", "clarion",
    "clearfield", "clinton", "columbia", "crawford", "cumberland",
    # D-E
    "dauphin", "delaware", "elk", "erie",
    # F-G
    "fayette", "forest", "franklin", "fulton", "greene",
    # H-J
    "huntingdon", "indiana", "jefferson", "juniata",
    # L
    "lackawanna", "lancaster", "lawrence", "lebanon", "lehigh", "luzerne",
    "lycoming",
    # M
    "mckean", "mercer", "mifflin", "monroe", "montgomery", "montour",
    # N-P
    "northampton", "northumberla", "perry", "philadelphia", "pike", "potter",
    # S
    "schuylkill", "snyder", "somerset", "sullivan", "susquehanna",
    # T-V
    "tioga", "union", "venango",
    # W-Y
    "warren", "washington", "wayne", "westmoreland", "wyoming", "york",
]

In [204]:
pdf_path = "../output/pdf_sample/pdfs/cs_Montgomery_CP_46_CR_0009245_2005.pdf"

# Read every page inside a context manager so the underlying file handle is
# released as soon as the text has been extracted (previously it was never closed).
# Each page becomes a list of text lines; the layout/density arguments are passed
# through to pdfplumber unchanged.
with pdfplumber.open(pdf_path) as pdf:
    pages = pdf.pages
    page_list = [
        page.extract_text(layout = True, x_density = 3.9, y_density = 13).split("\n")
        for page in pages
    ]

# Concatenate all pages into one flat list of lines, so to speak.
lines = [line for page in page_list for line in page]

In [206]:
# Collects the person-of-interest (POI) fields parsed from the summary header.
poi_dict = {}

# The POI block runs from the first line carrying "DOB:" up to (but excluding)
# the first line that mentions any case-status word.
_poi_status_words = ("closed", "inactive", "active", "adjudicated")
poi_start_index = [idx for idx, text in enumerate(lines) if "DOB:" in text][0]
poi_end_index = [
    idx
    for idx, text in enumerate(lines)
    if any(word in text.lower() for word in _poi_status_words)
][0]
poi = lines[poi_start_index:poi_end_index]

# Line 1: "<name> DOB: <dob> Sex: <sex>".
name_and_rest = poi[0].split("DOB:")
poi_dict["name"] = name_and_rest[0].strip()
dob_and_sex = name_and_rest[1].split("Sex:")
poi_dict["dob"] = dob_and_sex[0].strip()
poi_dict["sex"] = dob_and_sex[1].strip()

# Line 2: "<home location> Eyes: <eye colour>".
location_and_eyes = poi[1].split("Eyes:")
poi_dict["home_location"] = location_and_eyes[0].strip().lower()
poi_dict["eyes"] = location_and_eyes[1].strip()

# Line 3: only the hair colour is read; the text before "Hair:" is ignored.
poi_dict["hair"] = poi[2].split("Hair:")[1].strip()

# Line 4: "<first alias> Race: <race>".
alias_and_race = poi[3].split("Race:")
alias = alias_and_race[0].strip()
poi_dict["race"] = alias_and_race[1].strip()

# Every remaining POI line is treated as one more alias; the alias from line 4
# is placed last in the list.
remainder_alias = [extra.strip() for extra in poi[4:]] + [alias]
poi_dict["alias"] = remainder_alias

In [215]:
def extract_sqncs_and_sntncs(s_idx, s_lines):
    """Collect charge sequences and their sentences starting at ``s_lines[s_idx]``.

    Lines are lower-cased and stripped, then scanned until the end of
    ``s_lines`` or until a boundary line is reached: a bare case-status word,
    a county name from the module-level ``counties`` list, or a line containing
    "proc status: " (boundary lines containing "continued" are not boundaries).

    Args:
        s_idx: Index of the first line to examine.
        s_lines: List of text lines.

    Returns:
        A ``(seq_dict, next_idx)`` tuple. ``seq_dict`` maps "seq_N" to a dict of
        charge fields, which in turn holds "sentence_M" dicts; ``next_idx`` is
        the index of the boundary line, or ``len(s_lines)`` if none was found.
    """
    status_words = ("closed", "inactive", "active", "adjudicated")
    date_pattern = r"\d{2}/\d{2}/\d{4}"

    charges = {}
    seq_count = -1
    seq_key = "seq_" + str(seq_count)
    sentence_count = -1

    while s_idx < len(s_lines):
        text = s_lines[s_idx].lower().strip()

        # A new status group, county or docket line ends this run of charges.
        at_boundary = (
            text in status_words
            or text in counties
            or "proc status: " in text
        )
        if at_boundary and "continued" not in text:
            break

        if "§" in text:
            # "§" opens a new charge sequence; sentence numbering starts over.
            sentence_count = -1
            seq_count += 1
            seq_key = "seq_" + str(seq_count)

            # Fixed-width columns, sliced from the stripped line.
            charges[seq_key] = {
                "seq_num": text[:11].strip(),
                "statute": text[11:48].strip(),
                "grade": text[48:54].strip(),
                "description": text[54:95].strip(),
                "disposition": text[95:].strip(),
            }
        elif "min:" in text or "max:" in text or re.search(date_pattern, text):
            # "min:", "max:" or an nn/nn/nnnn date marks a sentence row that
            # belongs to the most recent sequence.
            sentence_count += 1
            sentence_key = "sentence_" + str(sentence_count)

            # Fixed-width columns, sliced from the stripped line.
            charges[seq_key][sentence_key] = {
                "sentence_date": text[:17].strip(),
                "sentence_type": text[17:43].strip(),
                "program_period": text[43:74].strip(),
                "sentence_length": text[74:].strip(),
            }

        # Any other line is ignored; move on to the next one.
        s_idx += 1

    return charges, s_idx

In [208]:
def extract_closed_cases(c_idx, c_lines):
    """Parse the run of closed cases that starts at ``c_lines[c_idx]``.

    Lines are lower-cased and stripped, then classified:

    * a bare "inactive", "active" or "adjudicated" line ends the run;
    * a name from the module-level ``counties`` list sets the current county;
    * a "proc status: " line opens a new case (docket number, proc status,
      DC number, OTN), unless it merely repeats the current docket directly
      after a line containing "continued";
    * "arrest dt: " and "def atty:" lines add details to the current case;
    * a "§" line hands over to ``extract_sqncs_and_sntncs`` for the charges;
    * anything else is skipped.

    Args:
        c_idx: Index of the first line to examine.
        c_lines: List of text lines.

    Returns:
        A ``(closed_dict, next_idx)`` tuple. ``closed_dict`` maps "case_N" to a
        dict of case fields; ``next_idx`` is the index of the status line that
        ended the run, or an index past the last line if none was found.

    Raises:
        IndexError: If a recognised line lacks one of its expected labels
            (for example a docket line without "dc no:" or "otn:").
    """
    closed_dict = {}
    case_nr = -1
    case_nr_idx = "case_" + str(case_nr)
    # No county header seen yet. Initialised so that a docket line preceding
    # any recognised county header records None instead of raising
    # UnboundLocalError.
    county = None

    while c_idx < len(c_lines):
        cur_c_line = c_lines[c_idx].lower().strip()
        # Detail and "§" lines are only meaningful once a docket line has
        # opened a case; before that they are skipped like any other junk line
        # rather than failing on the non-existent "case_-1" entry.
        has_case = case_nr != -1

        # A bare status line starts the next group; leave it for the caller.
        if cur_c_line in ("inactive", "active", "adjudicated"):
            break

        if cur_c_line in counties:
            county = cur_c_line
        elif "proc status: " in cur_c_line:
            # 1st line of a case: docket number, proc status, DC number, OTN.
            docket_fields = cur_c_line.split("proc status:")
            docket_number = docket_fields[0].strip()

            # Only look at the previous line when there is one; index -1 would
            # silently wrap around to the last line of the document.
            previous_line = c_lines[c_idx - 1].lower() if c_idx > 0 else ""
            repeated_header = (
                "continued" in previous_line
                and has_case
                and docket_number == closed_dict[case_nr_idx]["docket_number"]
            )

            # A repeated header for the docket already being collected carries
            # no new information, so only a genuinely new docket opens a case.
            if not repeated_header:
                status_fields = docket_fields[1].split("dc no:")
                number_fields = status_fields[1].split("otn:")
                case_nr += 1
                case_nr_idx = "case_" + str(case_nr)
                closed_dict[case_nr_idx] = {
                    "county": county,
                    "docket_number": docket_number,
                    "proc_status": status_fields[0].strip(),
                    "dc_nr": number_fields[0].strip(),
                    "otn_nr": number_fields[1].strip(),
                }
        elif has_case and "arrest dt: " in cur_c_line:
            # 2nd line of a case: arrest date, disposition date, disposition judge.
            date_fields = cur_c_line.split("arrest dt:")[1].split("disp date:")
            disp_fields = date_fields[1].split("disp judge:")
            closed_dict[case_nr_idx]["arrest_date"] = date_fields[0].strip()
            closed_dict[case_nr_idx]["disp_date"] = disp_fields[0].strip()
            closed_dict[case_nr_idx]["disp_judge"] = disp_fields[1].strip()
        elif has_case and "def atty:" in cur_c_line:
            # 3rd line of a case: defense attorney.
            closed_dict[case_nr_idx]["def_attorney"] = cur_c_line.split("def atty:")[1].strip()
        elif has_case and "§" in cur_c_line:
            # "§" marks the start of the charge sequences. The helper consumes
            # them and reports the index of the line on which it stopped.
            # NOTE(review): the helper numbers sequences from seq_0 on every
            # call, so a second "§" run for the same case would overwrite the
            # earlier keys via update() — confirm whether that can occur.
            sequence_dict, c_idx = extract_sqncs_and_sntncs(c_idx, c_lines)
            closed_dict[case_nr_idx].update(sequence_dict)
            continue

        # Every branch except the "§" hand-off consumes exactly one line.
        c_idx += 1

    return closed_dict, c_idx

In [209]:
def extract_inactive_active_cases(ia_idx, ia_lines):
    """Parse a run of inactive/active/adjudicated cases starting at ``ia_lines[ia_idx]``.

    Lines are lower-cased and stripped, then classified:

    * a bare "closed", "inactive", "active" or "adjudicated" line ends the run;
    * a name from the module-level ``counties`` list sets the current county;
    * a "proc status: " line opens a new case (docket number, proc status,
      DC number, OTN), unless it merely repeats the current docket directly
      after a line containing "continued";
    * "arrest dt: ", "last action: ", "next action: ", "def atty: " and
      "disp date:" lines add details to the current case;
    * a "§" line hands over to ``extract_sqncs_and_sntncs`` for the charges;
    * anything else is skipped.

    Args:
        ia_idx: Index of the first line to examine.
        ia_lines: List of text lines.

    Returns:
        An ``(ia_dict, next_idx)`` tuple. ``ia_dict`` maps "case_N" to a dict
        of case fields; ``next_idx`` is the index of the status line that ended
        the run, or an index past the last line if none was found.

    Raises:
        IndexError: If a recognised line lacks one of its expected labels
            (for example an arrest line without "trial dt:" or "legacy no:").
    """
    ia_dict = {}
    case_nr = -1
    case_nr_idx = "case_" + str(case_nr)
    # No county header seen yet. Initialised so that a docket line preceding
    # any recognised county header records None instead of raising
    # UnboundLocalError.
    county = None

    while ia_idx < len(ia_lines):
        cur_ia_line = ia_lines[ia_idx].lower().strip()
        # Detail and "§" lines are only meaningful once a docket line has
        # opened a case; before that they are skipped like any other junk line
        # rather than failing on the non-existent "case_-1" entry.
        has_case = case_nr != -1

        # A bare status line starts the next group; leave it for the caller.
        if cur_ia_line in ("closed", "inactive", "active", "adjudicated"):
            break

        if cur_ia_line in counties:
            county = cur_ia_line
        elif "proc status: " in cur_ia_line:
            # 1st line of a case: docket number, proc status, DC number, OTN.
            docket_fields = cur_ia_line.split("proc status:")
            docket_number = docket_fields[0].strip()

            # Only look at the previous line when there is one; index -1 would
            # silently wrap around to the last line of the document.
            previous_line = ia_lines[ia_idx - 1].lower() if ia_idx > 0 else ""
            repeated_header = (
                "continued" in previous_line
                and has_case
                and docket_number == ia_dict[case_nr_idx]["docket_number"]
            )

            # A repeated header for the docket already being collected carries
            # no new information, so only a genuinely new docket opens a case.
            if not repeated_header:
                status_fields = docket_fields[1].split("dc no:")
                number_fields = status_fields[1].split("otn:")
                case_nr += 1
                case_nr_idx = "case_" + str(case_nr)
                ia_dict[case_nr_idx] = {
                    "county": county,
                    "docket_number": docket_number,
                    "proc_status": status_fields[0].strip(),
                    "dc_nr": number_fields[0].strip(),
                    "otn_nr": number_fields[1].strip(),
                }
        elif has_case and "arrest dt: " in cur_ia_line:
            # Arrest date, trial date, legacy number.
            arrest_part = cur_ia_line.split("arrest dt:")[1]
            trial_fields = cur_ia_line.split("trial dt:")[1].split("legacy no:")
            ia_dict[case_nr_idx]["arrest_date"] = arrest_part.split("trial dt:")[0].strip()
            ia_dict[case_nr_idx]["trial_date"] = trial_fields[0].strip()
            ia_dict[case_nr_idx]["legacy_number"] = trial_fields[1].strip()
        elif has_case and "last action: " in cur_ia_line:
            # Last action, its date and its room.
            action_fields = cur_ia_line.split("last action:")[1].split("last action date:")
            when_where = action_fields[1].split("last action room:")
            ia_dict[case_nr_idx]["last_action"] = action_fields[0].strip()
            ia_dict[case_nr_idx]["last_action_date"] = when_where[0].strip()
            ia_dict[case_nr_idx]["last_action_room"] = when_where[1].strip()
        elif has_case and "next action: " in cur_ia_line:
            # Next action, its date and its room.
            action_fields = cur_ia_line.split("next action:")[1].split("next action date:")
            when_where = action_fields[1].split("next action room:")
            ia_dict[case_nr_idx]["next_action"] = action_fields[0].strip()
            ia_dict[case_nr_idx]["next_action_date"] = when_where[0].strip()
            ia_dict[case_nr_idx]["next_action_room"] = when_where[1].strip()
        elif has_case and "def atty: " in cur_ia_line:
            # Defense attorney, when listed.
            ia_dict[case_nr_idx]["def_attorney"] = cur_ia_line.split("def atty:")[1].strip()
        elif has_case and "§" in cur_ia_line:
            # "§" marks the start of the charge sequences. The helper consumes
            # them and reports the index of the line on which it stopped.
            # NOTE(review): the helper numbers sequences from seq_0 on every
            # call, so a second "§" run for the same case would overwrite the
            # earlier keys via update() — confirm whether that can occur.
            sequence_dict, ia_idx = extract_sqncs_and_sntncs(ia_idx, ia_lines)
            ia_dict[case_nr_idx].update(sequence_dict)
            continue
        elif has_case and "disp date:" in cur_ia_line:
            # Disposition date and judge, when listed.
            disp_fields = cur_ia_line.split("disp date:")[1].split("disp judge:")
            ia_dict[case_nr_idx]["disp_date"] = disp_fields[0].strip()
            ia_dict[case_nr_idx]["disp_judge"] = disp_fields[1].strip()

        # Every branch except the "§" hand-off consumes exactly one line.
        ia_idx += 1

    return ia_dict, ia_idx

In [216]:
current_line_index = poi_end_index
cs_dict = {}

# Walk the lines after the person block and collect the case history, one
# case-status section at a time.
while(current_line_index < len(lines)):
    cur_line = lines[current_line_index].lower().strip()

    # Only a line that is exactly a status word opens a section. The section
    # parsers start on the line after the header and return the index at which
    # they stopped, which is always beyond the header, so the loop advances.
    if(cur_line == "closed"):
        case_status = cur_line
        cs_dict[case_status], new_line_index = extract_closed_cases(current_line_index + 1, lines)
    elif(cur_line in ("inactive", "active", "adjudicated")):
        case_status = cur_line
        cs_dict[case_status], new_line_index = extract_inactive_active_cases(current_line_index + 1, lines)
    else:
        # Not a section header (including lines that merely contain a status
        # word): step over it. Previously the index became "" here, which made
        # the next loop comparison raise TypeError, and a partial status match
        # reused a stale or undefined parser result.
        new_line_index = current_line_index + 1

    current_line_index = new_line_index

poi_dict.update(cs_dict)

In [None]:
# Display the assembled record (person fields plus one entry per case status).
# NOTE(review): the output includes the person's name and date of birth —
# consider clearing this cell's output before sharing the notebook.
poi_dict

In [None]:
filename = "test.json"

# Write the assembled record to disk as JSON indented by four spaces.
with open(filename, "w") as json_file:
    json_file.write(json.dumps(poi_dict, indent = 4))