In [1]:
%load_ext rich
from ipywidgets import interact
from docket_extract import *
import io
import itertools
import os
import glob
import pandas as pd

In [2]:
ds_dir = "../../data/example_docketsheets_courtsummaries/"
fnames = list(sorted(glob.glob(os.path.join(ds_dir, "*DS_*pdf"))))
fnames

In [3]:
sectioned = {fname: extract_sections(extract_text_from_pdf(fname)) for fname in fnames}

In [4]:
all_keys = set(itertools.chain(*[list(d.keys()) for d in sectioned.values()]))
key_count = {k: sum([k in d for d in sectioned.values()]) for k in all_keys}
list(sorted(key_count.items(), key=lambda x: x[1], reverse=True))

In [5]:
[print(sectioned[fname]["DEFENDANT INFORMATION"]) for fname in fnames]

Name:         Young, Larry Lewis       Sex:         Male                        
    Date of Birth: 08/01/1951              Race:        White                       
    Address(es):                                                                    
    Home                                                                            
    Watsontown, PA 17777                                                            
    Advised of His Right to Apply for Assignment of Counsel? No                     
    Public Defender Requested by the Defendant? No                                  
    Application Provided for Appointment of Public Defender? Yes                    
    Has the Defendant Been Fingerprinted? Yes
Date Of Birth:  10/16/1950 City/State/Zip:  Roaring Spring, PA  16673
Date Of Birth:  06/14/1959 City/State/Zip:  Oil City, PA  16301
Date Of Birth:  09/28/1988 City/State/Zip:  Philadelphia, PA  19120
Name:         Fernacz, Richard         Sex:         Male                    

In [6]:

def collect_defendant_information(text: str) -> dict[str, str | None]:
    """Extracts the defendant's information from the given text.
        
    Args:
        text (str): The text containing the defendant's information.
        
    Returns:
        dict: A dictionary containing the extracted information with the following keys:
        - "dob" (str or None): The date of birth of the defendant.
        - "race" (str or None): The race of the defendant.
        - "sex" (str or None): The sex of the defendant.
        - "counsel_advised" (str or None): Whether the defendant was advised of their right to apply for assignment of counsel.
        - "defender_requested" (str or None): Whether the defendant requested a public defender.
        - "application_provided" (str or None): Whether an application was provided for the appointment of a public defender.
        - "was_fingerprinted" (str or None): Whether the defendant has been fingerprinted.
    """
    # Define regular expressions to extract the required information
    dob_pattern = r"Date of Birth:\s*([\d/]+)"
    race_pattern = r"Race:\s*(\w+)"
    sex_pattern = r"Sex:\s*(\w+)"
    counsel_pattern = r"Advised of His Right to Apply for Assignment of Counsel\?\s*(\w+)"
    defender_requested_pattern = r"Public Defender Requested by the Defendant\?\s*(\w+)"
    application_provided_pattern = r"Application Provided for Appointment of Public Defender\?\s*(\w+)"
    fingerprinted_pattern = r"Has the Defendant Been Fingerprinted\?\s*(\w+)"

    # Extract the information using the regular expressions
    dob_match = re.search(dob_pattern, text, re.IGNORECASE)
    race_match = re.search(race_pattern, text, re.IGNORECASE)
    sex_match = re.search(sex_pattern, text, re.IGNORECASE)
    counsel_match = re.search(counsel_pattern, text, re.IGNORECASE)
    defender_requested_match = re.search(defender_requested_pattern, text, re.IGNORECASE)
    application_provided_match = re.search(application_provided_pattern, text, re.IGNORECASE)
    fingerprinted_match = re.search(fingerprinted_pattern, text, re.IGNORECASE)

    # Get the matched groups
    dob = dob_match.group(1) if dob_match else None
    race = race_match.group(1) if race_match else None
    sex = sex_match.group(1) if sex_match else None
    counsel = counsel_match.group(1) if counsel_match else None
    defender_requested = defender_requested_match.group(1) if defender_requested_match else None
    application_provided = application_provided_match.group(1) if application_provided_match else None
    fingerprinted = fingerprinted_match.group(1) if fingerprinted_match else None

    # Store the extracted information in a dictionary
    extracted_info = {
        "dob": dob,
        "race": race,
        "sex": sex,
        "counsel_advised": counsel,
        "defender_requested": defender_requested,
        "application_provided": application_provided,
        "was_fingerprinted": fingerprinted
    }

    # Print the extracted information
    return extracted_info

In [7]:
[collect_defendant_information(sectioned[fname]["DEFENDANT INFORMATION"]) for fname in fnames]

In [8]:
sectioned[fnames[1]]["DEFENDANT INFORMATION"]

In [9]:
for fname in fnames:
    print(fname)
    print(sectioned[fname]["CHARGES"]) 
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
#Charge        Grade Description                 Offense Dt. Disposition   
         175 § 3802 §§ A1* M DUI: Gen Imp/Inc of Driving Safely - 1st Off 03/19/2015 Waived for Court
         275 § 3802 §§ B* M  DUI: High Rte of Alc (Bac.10 - <.16) 1st Off 03/19/2015 Waived for Court

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
Seq. Orig Seq. Grade Statute  Statute Description      Offense Dt. OTN       
        1    2     M1   75 § 3802 §§ C** DUI: Highest Rte of Alc (BAC .16+) 2nd 01/23/2006 K 367492-6

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
Seq. Orig Seq. Grade Statute  Statute Description      Offense Dt. OTN       
        1    1     M2   18 § 3928 §§ A Unauth Use Motor/Other Vehicles 02/14/2005 H 029661-2
        3    3     M2   18 § 3921 §§ A Theft By Unlaw Taking-Movable Prop 02/14/2005 H 029661-2
        5    5     M2   18 § 3925 §§ A 

In [None]:
import re

columns = ["Seq.", "Orig Seq.", "Grade",  "Statute", "Statute Description", "Offense Dt.",  "OTN"]
def extract_charges_MC(text):
    pattern = re.compile(
        r'(?P<Seq>\d+)\s+(?P<Orig_Seq>\d+)\s+(?P<Grade>\w*)\s+(?P<Statute>\d+\s§\s\d+(?:\s§§\s\w*\**)?|\d+\s§\s\d+)\s+(?P<Statute_Description>.+?)\s+(?P<Offense_Dt>\d{2}/\d{2}/\d{4})\s+(?P<OTN>\w+\s\d+-\d+)',
        re.MULTILINE 
    )
    matches = pattern.findall(text)
    return pd.DataFrame(matches, columns=["Seq", "Orig Seq", "Grade", "Statute", "Statute Description", "Offense Dt.", "OTN"])

def extract_charges_MJ(text):
    pattern = re.compile(
        r'(?P<Num>\d)(?P<Charge>\d+\s§\s\d+(?:\s§§\s\w*\**)?(?:\s\w*\s*)?)\s+(?P<Grade>\w*)\s+(?P<Description>.+?)\s+(?P<Offense_Dt>\d{2}/\d{2}/\d{4})\s+(?P<Disposition>.+)',
        re.MULTILINE
    )
    matches = pattern.findall(text)
    return pd.DataFrame(matches, columns=["#", "Charge", "Grade", "Description", "Offense Dt.", "Disposition"])
    
for fname in fnames:
    if "Statute" in sectioned[fname]["CHARGES"]:
        charges_df = extract_charges_MC(sectioned[fname]["CHARGES"])
    else:
        charges_df = extract_charges_MJ(sectioned[fname]["CHARGES"])
    print(fname) 
    print(charges_df)
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
   #             Charge Grade                                   Description  \
0  1   75 § 3802 §§ A1*     M  DUI: Gen Imp/Inc of Driving Safely - 1st Off   
1  2  75 § 3802 §§ B* M        DUI: High Rte of Alc (Bac.10 - <.16) 1st Off   

  Offense Dt.       Disposition  
0  03/19/2015  Waived for Court  
1  03/19/2015  Waived for Court  

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
  Seq Orig Seq Grade           Statute  \
0   1        2    M1  75 § 3802 §§ C**   

                      Statute Description Offense Dt.         OTN  
0  DUI: Highest Rte of Alc (BAC .16+) 2nd  01/23/2006  K 367492-6  

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
  Seq Orig Seq Grade         Statute                 Statute Description  \
0   1        1    M2  18 § 3928 §§ A     Unauth Use Motor/Other Vehicles   
1   3        3    M2  18 § 3921 §§ A  Theft By Unlaw 

In [11]:
[print(sectioned[fname]["STATUS INFORMATION"]) for fname in fnames]

Case Status    Status Date Processing Status                                    
    Closed         06/04/2015 Case Transferred to Court of Common Pleas             
                   06/02/2015 Completed                                             
                   04/14/2015 Awaiting Preliminary Hearing
Case Status: Closed  Status Date Processing Status        Complaint Date: 02/09/2006
                          07/26/2006 Sentenced/Penalty Imposed                      
                          06/12/2006 Awaiting Sentencing                            
                          06/07/2006 Awaiting Plea Court                            
                          05/02/2006 Awaiting Status Conference                     
                          04/11/2006 Awaiting Formal Arraignment                    
                          03/23/2006 Awaiting Filing of Information
Case Status: Closed  Status Date Processing Status          Arrest Date: 04/14/2005
                          11

In [27]:
def extract_status_information(text: str):
    # Extract the case status
    case_status_pattern = r"Case Status\s*:\s*(\w+)|Case Status\s+(\w+)"
    case_status_match = re.search(case_status_pattern, text)
    if case_status_match:
        case_status = case_status_match.group(1) if case_status_match.group(1) else case_status_match.group(2)
    else:
        case_status = None
    if case_status == "Status":
        case_status = text.split("\n")[1].strip().split()[0]

    # Remove the case status line from the text
    text = re.sub(case_status_pattern, '', text)

    # Extract the status date and processing status
    status_pattern = r"(\d{2}/\d{2}/\d{4})\s+(.+)"
    status_matches = re.findall(status_pattern, text)

    # Create a DataFrame
    status_df = pd.DataFrame(status_matches, columns=["status_date", "processing_status"])
    status_df["processing_status"] = status_df["processing_status"].str.strip()

    return case_status, status_df

# Example usage
text = """Case Status    Closed
    Status Date Processing Status                                    
    07/30/2020 Case Transferred to Court of Common Pleas             
    07/23/2020 Completed                                             
    07/23/2020 Case Balance Due                                      
    07/23/2020 Completed                                             
    04/02/2020 Awaiting Preliminary Hearing                          
    04/02/2020 Awaiting Preliminary Arraignment"""

case_status, status_df = extract_status_information(text)
print(case_status)
print(status_df)

# Additional example usage
text = """Case Status    Status Date Processing Status                                    
    Closed         06/04/2015 Case Transferred to Court of Common Pleas             
                   06/02/2015 Completed                                             
                   04/14/2015 Awaiting Preliminary Hearing"""

case_status, status_df = extract_status_information(text)
print(case_status)
print(status_df)

Closed
  status_date                          processing_status
0  07/30/2020  Case Transferred to Court of Common Pleas
1  07/23/2020                                  Completed
2  07/23/2020                           Case Balance Due
3  07/23/2020                                  Completed
4  04/02/2020               Awaiting Preliminary Hearing
5  04/02/2020           Awaiting Preliminary Arraignment
Closed
  status_date                          processing_status
0  06/04/2015  Case Transferred to Court of Common Pleas
1  06/02/2015                                  Completed
2  04/14/2015               Awaiting Preliminary Hearing


In [28]:
for fname in fnames:
    print(fname)
    print(extract_status_information(sectioned[fname]["STATUS INFORMATION"]))
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
('Closed',   status_date                          processing_status
0  06/04/2015  Case Transferred to Court of Common Pleas
1  06/02/2015                                  Completed
2  04/14/2015               Awaiting Preliminary Hearing)

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
('Closed',   status_date                     processing_status
0  02/09/2006  07/26/2006 Sentenced/Penalty Imposed
1  06/12/2006                   Awaiting Sentencing
2  06/07/2006                   Awaiting Plea Court
3  05/02/2006            Awaiting Status Conference
4  04/11/2006           Awaiting Formal Arraignment
5  03/23/2006        Awaiting Filing of Information)

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
('Closed',    status_date                  processing_status
0   04/14/2005               11/14/2005 Completed
1   09/22/2005  Awaiting Appellate Cour

In [29]:
key = "CALENDAR EVENTS"
[print(sectioned[fname][key]) for fname in fnames]

Case Calendar    Schedule                                        Schedule       
    Event Type       Start Date Start Time Room    Judge Name        Status         
    Preliminary Hearing 05/12/2015  3:00 pm        Magisterial District Judge  Continued
                                                   Jeffrey L. Mensch
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Date Time                                 Status          
      Plea Court    06/12/2006  9:00 am Courtroom 1                 Scheduled       
      Parole Hearing 11/06/2006  3:00 pm Courtroom 1                Scheduled
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Date Time                                 Status          
      Formal Arraignment 06/03/2005  9:00 am                        Scheduled
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Da

In [34]:
def extract_calendar_events(text: str) -> pd.DataFrame:
    """
    Extracts calendar events from the given text.
    
    Args:
        text (str): The text containing the calendar events.
        
    Returns:
        pd.DataFrame: A DataFrame containing the extracted events with the following columns:
        - "event_type" (str): The type of the event.
        - "start_datetime" (str): The start date and time of the event.
        - "room" (str): The room where the event is scheduled.
        - "judge" (str): The judge assigned to the event.
        - "status" (str): The status of the event.
    """
    # Define regular expression to extract the events
    event_pattern = re.compile(
        r'(?P<start_datetime>\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}\s[APM]{2})\s+(?P<event_type>.+?)\s+(?P<room>.+?)\s+(?P<judge>.+?)\s+(?P<status>.+)',
        re.MULTILINE
    )
    
    # Extract the events using the regular expression
    events_matches = event_pattern.findall(text)
    
    # Create a DataFrame from the extracted events
    events_df = pd.DataFrame(events_matches, columns=["start_datetime", "event_type", "room", "judge", "status"])
    
    return events_df

# Example usage
text = """Case Calendar  Schedule  Start Room       Judge Name          Schedule        
    06/04/2015 10:00 AM  Preliminary Hearing  Scheduled  Judge 
                                                         John Doe
    06/02/2015 09:00 AM  Arraignment  Completed  Judge Jane Smith
    04/14/2015 01:30 PM  Status Conference  Continued  Judge Richard Roe"""

events_df = extract_calendar_events(text)
print(events_df)

        start_datetime   event_type        room      judge             status
0  06/04/2015 10:00 AM  Preliminary     Hearing  Scheduled             Judge 
1  06/02/2015 09:00 AM  Arraignment   Completed      Judge         Jane Smith
2  04/14/2015 01:30 PM       Status  Conference  Continued  Judge Richard Roe


In [35]:
for fname in fnames:
    print(fname)
    print(extract_status_information(sectioned[fname][key]))
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
(None,   status_date                                  processing_status
0  05/12/2015  3:00 pm        Magisterial District Judge  Con...)

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
(None,   status_date                              processing_status
0  06/12/2006  9:00 am Courtroom 1                 Scheduled
1  11/06/2006   3:00 pm Courtroom 1                Scheduled)

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
(None,   status_date                         processing_status
0  06/03/2005  9:00 am                        Scheduled)

../../data/example_docketsheets_courtsummaries/DS_MC-51-CR-0016214-2020.pdf
(None,   status_date                                  processing_status
0  08/27/2020      5:58 pm B08                         Scheduled
1  10/14/2020  10:00 am 404     Judge Patrick F. Dugan Scheduled
2  12/02/2020      9:00 am 200        