In [12]:
import pdfplumber
import numpy as np 
import pandas as pd
import os
import time

In [13]:
"""
*ฅ^•ﻌ•^ฅ*
Input and Output Directory
*ฅ^•ﻌ•^ฅ*
"""

input_directory = "../../data/example_docketsheets_courtsummaries/"
files = os.listdir(input_directory)
# Assuming Court summaries have the following file format -> CS_ MJ-17302-CR-0000035-2015.pdf
court_summaries = [f for f in files if 'CS' in f]

output_directory = "parsed_court_summaries"
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

In [14]:
def give_indices(str_to_find,case_info):
    occurences = [x for x in case_info if str_to_find in x]
    index_occurences = [case_info.index(x) for x in occurences]
    return index_occurences

In [15]:
"""
*ฅ^•ﻌ•^ฅ*
Big Loop to process a Court Summary page by page. 
*ฅ^•ﻌ•^ฅ*
"""
# stats for file processing
start_time = time.time()
# keeping track of files processed
good_files_counter = 0
tricky_files_counter = 0

for cs_file in court_summaries:

    full_path = f"{input_directory}/{cs_file}"
    pdf = pdfplumber.open(full_path)

    pages = pdf.pages

    """
        ∧＿∧
       (｡･ω･｡)つ━☆・*。
     ⊂/    /       ・゜
      しーＪ         °。+ * 。

    Parse through each PDF, if any error comes up write it a text file.
    """
    try:
        info_dict = {}
        for i,page in enumerate(pages):
            # print(f"Parsing Page {i+1}")
            text = page.extract_text(keep_blank_chars=True,layout=True).split("\n")

            # first page with POI
            dob_index = [i for i,x in enumerate(text) if "DOB:" in x]
            closed_indices = [i for i,x in enumerate(text) if "closed" in x.lower() or "inactive" in x.lower()]
            if i == 0:
                poi_info = text[dob_index[0]:closed_indices[0]]
                # Name / DOB / SEX
                poi_info_l0 = poi_info[0].split("DOB:")
                info_dict['Name'] = poi_info_l0[0].strip()
                poi_info_l00 = poi_info_l0[1].split("Sex:")
                info_dict['DOB'] = poi_info_l00[0].strip()
                info_dict['Sex'] = poi_info_l00[1].strip()
                # LOCATION / BLANK / EYES
                poi_info_l1 = poi_info[1].split("Eyes:")
                info_dict['Location'] = poi_info_l1[0].strip()
                info_dict['Eyes'] = poi_info_l1[1].strip()
                # ALIASES (TITLE ) / BLANK /  HAIR
                poi_info_l2 = poi_info[2].split("Hair:")
                info_dict['Hair'] = poi_info_l1[1].strip()
                # # ALIAS / BLANK / RACE
                poi_info_l3 = poi_info[3].split("Race:")
                info_dict['Aliases'] = poi_info_l3[0].strip()
                info_dict['Race'] = poi_info_l3[1].strip()
                if len(poi_info) > 4:
                    # sometimes empty strings or Court information is parsed from this Alias strategy
                    # sometimes Aliases: Name will be included
                    pre_cleaned_aliases = [x.strip().split(":")[-1] for x in [info_dict["Aliases"]] + poi_info[4:] if 'Court' not in x]
                    info_dict["Aliases"] = [x for x in pre_cleaned_aliases if len(x)>2]


            case_info = text[closed_indices[0]+1:]
            closed_endings = [0] + give_indices('Proc',case_info)
            printed_endings = give_indices('Printed',case_info)

            start_index = 0
            if len(closed_endings) < 2:
                closed_endings.extend(give_indices('Recent entries',case_info))
            end_index = closed_endings[1]

            case_numbers = []
            case_information = []
            while len(closed_endings) > 0:
            # processing cases 
                
                start_index = closed_endings[0]
                if len(closed_endings) > 1:
                    end_index = closed_endings[1]-1
                else:
                    end_index = printed_endings[0]

                case_sliced = case_info[start_index:end_index]

                ## ripping out case number
                # '        MJ-26303-CR-0000170-2003 Processing Status: Completed OTN: H 647213-0       ',
                processing_status_str = 'Processing Status:'
                proc_status_str = 'Proc Status:'

                if any(filter(lambda s: processing_status_str in s, case_sliced)):
                    case_number_str = [x for x in case_sliced if processing_status_str in x]
                    case_number_index = give_indices(processing_status_str,case_number_str)[0]+1
                    case_number = case_number_str[0].split(processing_status_str)[0].strip()
                    
                elif any(filter(lambda s: proc_status_str in s, case_sliced)):
                    case_number_str = [x for x in case_sliced if proc_status_str in x]
                    case_number_index = give_indices(proc_status_str,case_number_str)[0]+1
                    case_number = case_number_str[0].split(proc_status_str)[0].strip()
                else:
                    # print("No Processing Found")
                    break
                

                # continue processing the rest of the case info
                case_sliced = case_sliced[case_number_index:]

                

                #  '        Statute          Grade Description             Disposition          Counts  ',
                column_names_cases = give_indices('Statute',case_sliced)[0]+1

                # Hacky Way to get Description Column
                # 1. Determine at what index Description occurs
                description_index = case_sliced[column_names_cases-1].index("Description")
                

                case_info_dict = {}
                sliced_case_v0 = case_sliced[:column_names_cases-1]

                for x in sliced_case_v0:
                    pre_split_v0 = x.split("  ")
                    pre_split_ws_removed_v0 = [s for s in pre_split_v0 if s]
                    for val in pre_split_ws_removed_v0:
                        vals = val.split(":")
                        if len(vals) > 1:
                            case_info_dict[vals[0].strip()] = vals[1]

                sliced_case_v1 = case_sliced[column_names_cases:]

                stat = []
                gd = []
                counts = []
                description = []
                disposition = []
                for x in sliced_case_v1:
                    pre_split = x.split("  ")
                    pre_split_ws_removed = [s for s in pre_split if s]
                    if len(pre_split_ws_removed) > 3:
                        stat.append(pre_split_ws_removed[0].strip())
                        gd.append(pre_split_ws_removed[1].strip())
                        # Descriptions generally follow the same place as the Column
                        description.append(pre_split_ws_removed[2].strip())
                        disposition.append(pre_split_ws_removed[3].strip())
                        counts.append(pre_split_ws_removed[-1].strip())
                        
                case_info_dict['Statute'] = stat
                case_info_dict['Grade Description'] = gd
                case_info_dict['Counts'] = counts
                case_info_dict['Description'] = description
                case_info_dict['Disposition'] = disposition
                case_info_dict['Status'] = text[closed_indices[0]].strip()
                case_info_dict['case_number'] = case_number

                # info_dict[case_number] = case_info_dict
                case_numbers.append(case_number)
                case_information.append(case_info_dict)

                closed_endings.pop(0)

        """
        Construct a POI table where POI information is repeated for each unique case
        """
        df_info = pd.json_normalize(info_dict)
        df_case = pd.DataFrame(np.tile(df_info,(len(set(case_numbers)),1)))
        df_case['case_number'] = list(set(case_numbers))

        """
        Merge POI table with case information
        """

        df_final = pd.merge(df_case,pd.DataFrame(case_information),how='outer',left_on='case_number',right_on='case_number')
        
        file_name = cs_file.replace(".pdf","")
        df_final.to_csv(f"{output_directory}/{file_name}.csv")

        with open("parsed_pdfs.txt", "a+") as file:
            file.write(cs_file)

        good_files_counter += 1
    except Exception as e:
        # print(e)
        with open("tricky_pdfs.txt", "a+") as file:
            file.write(cs_file)

        tricky_files_counter += 1
end_time = time.time()
print(f"{len(court_summaries)} files took {end_time-start_time} seconds to process")
print(f"{good_files_counter} ({good_files_counter/len(court_summaries):2f}%) were successfully extracted")
print(f"{tricky_files_counter} ({tricky_files_counter/len(court_summaries):2f}%) were not succesfully extracted")

10 files took 2.873115062713623 seconds to process
5 (0.500000%) were successfully extracted
5 (0.500000%) were not succesfully extracted
