In [161]:
%load_ext rich
from ipywidgets import interact
from docket_extract import *
import io
import itertools
import os
import glob
import pandas as pd

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


In [162]:
ds_dir = "../../data/example_docketsheets_courtsummaries/"
fnames = list(sorted(glob.glob(os.path.join(ds_dir, "*DS_*pdf"))))
fnames

In [163]:
sectioned = {fname: extract_sections(extract_text_from_pdf(fname)) for fname in fnames}


In [164]:
all_keys = set(itertools.chain(*[list(d.keys()) for d in sectioned.values()]))
key_count = {k: sum([k in d for d in sectioned.values()]) for k in all_keys}
list(sorted(key_count.items(), key=lambda x: x[1], reverse=True))

In [165]:

replacements = {
    "DISPOSITION SENTENCING/PENALTIES": "DISPOSITION / SENTENCING DETAILS"
    }
remove_keys = ["", "PAYMENT PLAN SUMMARY"]
for fname, sdict in sectioned.items():
    for replace_source, replace_target in replacements.items():
        if replace_source in sdict:
            sdict[replace_target] = sdict.pop(replace_source)
    for k in remove_keys:
        if k in sdict:
            sdict.pop(k)
all_keys = set(itertools.chain(*[list(d.keys()) for d in sectioned.values()]))
key_count = {k: sum([k in d for d in sectioned.values()]) for k in all_keys}
list(sorted(key_count.items(), key=lambda x: x[1], reverse=True))

In [166]:
[print(sectioned[fname]["DEFENDANT INFORMATION"]) for fname in fnames]

Name:         Young, Larry Lewis       Sex:         Male                        
    Date of Birth: 08/01/1951              Race:        White                       
    Address(es):                                                                    
    Home                                                                            
    Watsontown, PA 17777                                                            
    Advised of His Right to Apply for Assignment of Counsel? No                     
    Public Defender Requested by the Defendant? No                                  
    Application Provided for Appointment of Public Defender? Yes                    
    Has the Defendant Been Fingerprinted? Yes
Date Of Birth:  10/16/1950 City/State/Zip:  Roaring Spring, PA  16673
Date Of Birth:  06/14/1959 City/State/Zip:  Oil City, PA  16301
Date Of Birth:  09/28/1988 City/State/Zip:  Philadelphia, PA  19120
Name:         Fernacz, Richard         Sex:         Male                    

In [167]:

def collect_defendant_information(text: str) -> dict[str, str | None]:
    """Extracts the defendant's information from the given text.
        
    Args:
        text (str): The text containing the defendant's information.
        
    Returns:
        dict: A dictionary containing the extracted information with the following keys:
        - "dob" (str or None): The date of birth of the defendant.
        - "race" (str or None): The race of the defendant.
        - "sex" (str or None): The sex of the defendant.
        - "counsel_advised" (str or None): Whether the defendant was advised of their right to apply for assignment of counsel.
        - "defender_requested" (str or None): Whether the defendant requested a public defender.
        - "application_provided" (str or None): Whether an application was provided for the appointment of a public defender.
        - "was_fingerprinted" (str or None): Whether the defendant has been fingerprinted.
    """
    # Define regular expressions to extract the required information
    dob_pattern = r"Date of Birth:\s*([\d/]+)"
    race_pattern = r"Race:\s*(\w+)"
    sex_pattern = r"Sex:\s*(\w+)"
    counsel_pattern = r"Advised of His Right to Apply for Assignment of Counsel\?\s*(\w+)"
    defender_requested_pattern = r"Public Defender Requested by the Defendant\?\s*(\w+)"
    application_provided_pattern = r"Application Provided for Appointment of Public Defender\?\s*(\w+)"
    fingerprinted_pattern = r"Has the Defendant Been Fingerprinted\?\s*(\w+)"

    # Extract the information using the regular expressions
    dob_match = re.search(dob_pattern, text, re.IGNORECASE)
    race_match = re.search(race_pattern, text, re.IGNORECASE)
    sex_match = re.search(sex_pattern, text, re.IGNORECASE)
    counsel_match = re.search(counsel_pattern, text, re.IGNORECASE)
    defender_requested_match = re.search(defender_requested_pattern, text, re.IGNORECASE)
    application_provided_match = re.search(application_provided_pattern, text, re.IGNORECASE)
    fingerprinted_match = re.search(fingerprinted_pattern, text, re.IGNORECASE)

    # Get the matched groups
    dob = dob_match.group(1) if dob_match else None
    race = race_match.group(1) if race_match else None
    sex = sex_match.group(1) if sex_match else None
    counsel = counsel_match.group(1) if counsel_match else None
    defender_requested = defender_requested_match.group(1) if defender_requested_match else None
    application_provided = application_provided_match.group(1) if application_provided_match else None
    fingerprinted = fingerprinted_match.group(1) if fingerprinted_match else None

    # Store the extracted information in a dictionary
    extracted_info = {
        "dob": dob,
        "race": race,
        "sex": sex,
        "counsel_advised": counsel,
        "defender_requested": defender_requested,
        "application_provided": application_provided,
        "was_fingerprinted": fingerprinted
    }

    # Print the extracted information
    return extracted_info

In [168]:
[collect_defendant_information(sectioned[fname]["DEFENDANT INFORMATION"]) for fname in fnames]

In [169]:
sectioned[fnames[1]]["DEFENDANT INFORMATION"]

In [170]:
for fname in fnames:
    print(fname)
    print(sectioned[fname]["CHARGES"]) 
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
#Charge        Grade Description                 Offense Dt. Disposition   
         175 § 3802 §§ A1* M DUI: Gen Imp/Inc of Driving Safely - 1st Off 03/19/2015 Waived for Court
         275 § 3802 §§ B* M  DUI: High Rte of Alc (Bac.10 - <.16) 1st Off 03/19/2015 Waived for Court

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
Seq. Orig Seq. Grade Statute  Statute Description      Offense Dt. OTN       
        1    2     M1   75 § 3802 §§ C** DUI: Highest Rte of Alc (BAC .16+) 2nd 01/23/2006 K 367492-6

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
Seq. Orig Seq. Grade Statute  Statute Description      Offense Dt. OTN       
        1    1     M2   18 § 3928 §§ A Unauth Use Motor/Other Vehicles 02/14/2005 H 029661-2
        3    3     M2   18 § 3921 §§ A Theft By Unlaw Taking-Movable Prop 02/14/2005 H 029661-2
        5    5     M2   18 § 3925 §§ A 

In [171]:
import re

columns = ["Seq.", "Orig Seq.", "Grade",  "Statute", "Statute Description", "Offense Dt.",  "OTN"]
def extract_charges_MC(text):
    pattern = re.compile(
        r'(?P<Seq>\d+)\s+(?P<Orig_Seq>\d+)\s+(?P<Grade>\w*)\s+(?P<Statute>\d+\s§\s\d+(?:\s§§\s\w*\**)?|\d+\s§\s\d+)\s+(?P<Statute_Description>.+?)\s+(?P<Offense_Dt>\d{2}/\d{2}/\d{4})\s+(?P<OTN>\w+\s\d+-\d+)',
        re.MULTILINE 
    )
    matches = pattern.findall(text)
    return pd.DataFrame(matches, columns=["Seq", "Orig Seq", "Grade", "Statute", "Statute Description", "Offense Dt.", "OTN"])

def extract_charges_MJ(text):
    pattern = re.compile(
        r'(?P<Num>\d)(?P<Charge>\d+\s§\s\d+(?:\s§§\s\w*\**)?(?:\s\w*\s*)?)\s+(?P<Grade>\w*)\s+(?P<Description>.+?)\s+(?P<Offense_Dt>\d{2}/\d{2}/\d{4})\s+(?P<Disposition>.+)',
        re.MULTILINE
    )
    matches = pattern.findall(text)
    return pd.DataFrame(matches, columns=["#", "Charge", "Grade", "Description", "Offense Dt.", "Disposition"])
    
for fname in fnames:
    if "Statute" in sectioned[fname]["CHARGES"]:
        charges_df = extract_charges_MC(sectioned[fname]["CHARGES"])
    else:
        charges_df = extract_charges_MJ(sectioned[fname]["CHARGES"])
    print(fname) 
    print(charges_df)
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
   #             Charge Grade                                   Description  \
0  1   75 § 3802 §§ A1*     M  DUI: Gen Imp/Inc of Driving Safely - 1st Off   
1  2  75 § 3802 §§ B* M        DUI: High Rte of Alc (Bac.10 - <.16) 1st Off   

  Offense Dt.       Disposition  
0  03/19/2015  Waived for Court  
1  03/19/2015  Waived for Court  

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
  Seq Orig Seq Grade           Statute  \
0   1        2    M1  75 § 3802 §§ C**   

                      Statute Description Offense Dt.         OTN  
0  DUI: Highest Rte of Alc (BAC .16+) 2nd  01/23/2006  K 367492-6  

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
  Seq Orig Seq Grade         Statute                 Statute Description  \
0   1        1    M2  18 § 3928 §§ A     Unauth Use Motor/Other Vehicles   
1   3        3    M2  18 § 3921 §§ A  Theft By Unlaw 

In [172]:
[print(sectioned[fname]["STATUS INFORMATION"]) for fname in fnames]

Case Status    Status Date Processing Status                                    
    Closed         06/04/2015 Case Transferred to Court of Common Pleas             
                   06/02/2015 Completed                                             
                   04/14/2015 Awaiting Preliminary Hearing
Case Status: Closed  Status Date Processing Status        Complaint Date: 02/09/2006
                          07/26/2006 Sentenced/Penalty Imposed                      
                          06/12/2006 Awaiting Sentencing                            
                          06/07/2006 Awaiting Plea Court                            
                          05/02/2006 Awaiting Status Conference                     
                          04/11/2006 Awaiting Formal Arraignment                    
                          03/23/2006 Awaiting Filing of Information
Case Status: Closed  Status Date Processing Status          Arrest Date: 04/14/2005
                          11

In [173]:
def extract_status_information(text: str):
    # Extract the case status
    case_status_pattern = r"Case Status\s*:\s*(\w+)|Case Status\s+(\w+)"
    case_status_match = re.search(case_status_pattern, text)
    if case_status_match:
        case_status = case_status_match.group(1) if case_status_match.group(1) else case_status_match.group(2)
    else:
        case_status = None
    if case_status == "Status":
        case_status = text.split("\n")[1].strip().split()[0]

    # Remove the case status line from the text
    text = re.sub(case_status_pattern, '', text)

    # Extract the status date and processing status
    status_pattern = r"(\d{2}/\d{2}/\d{4})\s+(.+)"
    status_matches = re.findall(status_pattern, text)

    # Create a DataFrame
    status_df = pd.DataFrame(status_matches, columns=["status_date", "processing_status"])
    status_df["processing_status"] = status_df["processing_status"].str.strip()

    return case_status, status_df

# Example usage
text = """Case Status    Closed
    Status Date Processing Status                                    
    07/30/2020 Case Transferred to Court of Common Pleas             
    07/23/2020 Completed                                             
    07/23/2020 Case Balance Due                                      
    07/23/2020 Completed                                             
    04/02/2020 Awaiting Preliminary Hearing                          
    04/02/2020 Awaiting Preliminary Arraignment"""

case_status, status_df = extract_status_information(text)
print(case_status)
print(status_df)

# Additional example usage
text = """Case Status    Status Date Processing Status                                    
    Closed         06/04/2015 Case Transferred to Court of Common Pleas             
                   06/02/2015 Completed                                             
                   04/14/2015 Awaiting Preliminary Hearing"""

case_status, status_df = extract_status_information(text)
print(case_status)
print(status_df)

Closed
  status_date                          processing_status
0  07/30/2020  Case Transferred to Court of Common Pleas
1  07/23/2020                                  Completed
2  07/23/2020                           Case Balance Due
3  07/23/2020                                  Completed
4  04/02/2020               Awaiting Preliminary Hearing
5  04/02/2020           Awaiting Preliminary Arraignment
Closed
  status_date                          processing_status
0  06/04/2015  Case Transferred to Court of Common Pleas
1  06/02/2015                                  Completed
2  04/14/2015               Awaiting Preliminary Hearing


In [174]:
for fname in fnames:
    print(fname)
    print(extract_status_information(sectioned[fname]["STATUS INFORMATION"]))
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
('Closed',   status_date                          processing_status
0  06/04/2015  Case Transferred to Court of Common Pleas
1  06/02/2015                                  Completed
2  04/14/2015               Awaiting Preliminary Hearing)

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
('Closed',   status_date                     processing_status
0  02/09/2006  07/26/2006 Sentenced/Penalty Imposed
1  06/12/2006                   Awaiting Sentencing
2  06/07/2006                   Awaiting Plea Court
3  05/02/2006            Awaiting Status Conference
4  04/11/2006           Awaiting Formal Arraignment
5  03/23/2006        Awaiting Filing of Information)

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
('Closed',    status_date                  processing_status
0   04/14/2005               11/14/2005 Completed
1   09/22/2005  Awaiting Appellate Cour

In [175]:
key = "CALENDAR EVENTS"
[print(sectioned[fname][key]) for fname in fnames]

Case Calendar    Schedule                                        Schedule       
    Event Type       Start Date Start Time Room    Judge Name        Status         
    Preliminary Hearing 05/12/2015  3:00 pm        Magisterial District Judge  Continued
                                                   Jeffrey L. Mensch
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Date Time                                 Status          
      Plea Court    06/12/2006  9:00 am Courtroom 1                 Scheduled       
      Parole Hearing 11/06/2006  3:00 pm Courtroom 1                Scheduled
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Date Time                                 Status          
      Formal Arraignment 06/03/2005  9:00 am                        Scheduled
Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Da

In [176]:

possible_events = ["Preliminary Hearing", "Plea Court", "Parole Hearing", "Formal Arraignment",
                   "Status Conference", "Status", "Trial", "Bail Revocation", "Bail Hearing", "Preliminary Arraignment"]
partial_events = ["Preliminary", "Plea", "Parole", "Formal", "Status", "Bail"]
possible_statuses = ["Scheduled", "Completed", "Continued", "Cancelled"]

def extract_calendar_events(text: str) -> pd.DataFrame:
    """
    Extracts calendar events from the given text.
    
    Args:
        text (str): The text containing the calendar events.
        
    Returns:
        pd.DataFrame: A DataFrame containing the extracted events with the following columns:
        - "event_type" (str): The type of the event.
        - "start_datetime" (str): The start date and time of the event.
        - "room" (str): The room where the event is scheduled.
        - "judge" (str): The judge assigned to the event.
        - "status" (str): The status of the event.
    """
    # Split the text into lines
    lines = text.split('\n')
    
    # Define a pattern to match datetime
    datetime_pattern = re.compile(r'\d{2}/\d{2}/\d{4}\s+\d{1,2}:\d{2}\s*(AM|PM|am|pm)')
    
    # Find lines that contain a datetime and keep their indices
    datetime_indices = [i for i, line in enumerate(lines) if datetime_pattern.search(line)]
    
    # Create a list of start_datetimes
    start_datetimes = [datetime_pattern.search(lines[i]).group() for i in datetime_indices]
    # Remove the datetime from lines that contain them
    for i in datetime_indices:
        lines[i] = datetime_pattern.sub('', lines[i])

    # Initialize a list to store event types
    event_types = []

    # Search for event types in lines noted in datetime_indices
    for i in datetime_indices:
        event_type = next((event for event in possible_events + partial_events if event in lines[i]), None)
        if event_type:
            lines[i] = lines[i].replace(event_type, '').strip()
        if event_type in partial_events and i + 1 < len(lines) and lines[i + 1].strip():
            continuation = lines[i + 1].strip().split()[0].strip()
            if event_type + " " + continuation in possible_events:
                event_type = event_type + " " + continuation
            lines[i + 1] = lines[i+1].replace(continuation, '').strip()
        event_types.append(event_type)

    # Initialize a list to store statuses
    statuses = []

    # Search for statuses in lines noted in datetime_indices
    for i in datetime_indices:
        status = next((stat for stat in possible_statuses if stat in lines[i]), None)
        statuses.append(status)
        if status:
            lines[i] = lines[i].replace(status, '').strip()

    # Initialize a list to store rooms
    rooms = []

    # Define a pattern to match room
    room_pattern = re.compile(r'(\b\d+\b|\b[A-Z]\d+\b|Courtroom: \w+-\d+-\d+)')

    # Search for rooms in lines noted in datetime_indices
    for i in datetime_indices:
        room_match = room_pattern.search(lines[i])
        room = room_match.group() if room_match else None
        rooms.append(room)
        if room:
            lines[i] = lines[i].replace(room, '').strip()
    
    # Initialize a list to store judges
    judges = []
    # Search for judges in lines noted in datetime_indices
    for i in datetime_indices:
        judge = lines[i].strip()
        if i + 1 < len(lines) and lines[i + 1].strip():
            cline = lines[i + 1].strip()
            if len(cline.split()) <= 3:
                judge += " " + cline
        judges.append(judge)
    # Initialize an empty DataFrame
    events_df = pd.DataFrame(list(zip(start_datetimes, event_types, rooms, judges, statuses)),
     columns=["start_datetime", "event_type", "room", "judge", "status"])
    
    
    return events_df

# Example usage
text = """Case Calendar  Schedule  Start Room       Judge Name          Schedule        
      Event Type    Start Date Time                                 Status          
      Preliminary   08/27/2020  5:58 pm B08                         Scheduled       
      Arraignment                                                                   
      Status        10/14/2020 10:00 am 404     Judge Patrick F. Dugan Scheduled    
      Status        12/02/2020  9:00 am 200                         Cancelled       
      Trial         09/16/2021 11:30 am 606     Judge David H. Conroy Continued     
      Trial         12/10/2021 10:00 am 405     Judge David H. Conroy Scheduled"""

events_df = extract_calendar_events(text)
print(events_df)

        start_datetime               event_type room                   judge  \
0  08/27/2020  5:58 pm  Preliminary Arraignment  B08                           
1  10/14/2020 10:00 am                   Status  404  Judge Patrick F. Dugan   
2  12/02/2020  9:00 am                     None  200                           
3  09/16/2021 11:30 am                    Trial  606   Judge David H. Conroy   
4  12/10/2021 10:00 am                    Trial  405   Judge David H. Conroy   

      status  
0  Scheduled  
1  Scheduled  
2  Cancelled  
3  Continued  
4  Scheduled  


In [177]:
for fname in fnames:
    print(fname)
    print(extract_calendar_events(sectioned[fname][key]))
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
        start_datetime           event_type  room  \
0  05/12/2015  3:00 pm  Preliminary Hearing  None   

                                          judge     status  
0  Magisterial District Judge Jeffrey L. Mensch  Continued  

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
        start_datetime      event_type room                judge     status
0  06/12/2006  9:00 am      Plea Court    1  Courtroom Courtroom  Scheduled
1  11/06/2006  3:00 pm  Parole Hearing    1            Courtroom  Scheduled

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
        start_datetime          event_type  room judge     status
0  06/03/2005  9:00 am  Formal Arraignment  None        Scheduled

../../data/example_docketsheets_courtsummaries/DS_MC-51-CR-0016214-2020.pdf
        start_datetime               event_type room                   judge  \
0  08/27/2020  5:58 

In [178]:
key = "CASE PARTICIPANTS"
[print(sectioned[fname][key]) for fname in fnames]

Participant Type         Participant Name                                       
    Arresting Officer        Andres, Maxwell W.                                     
    Defendant                Young, Larry Lewis
Participant Type        Name                                                   
     Defendant               Claar, Patricia Ann
Participant Type        Name                                                   
     Defendant               Loll, Daniel Mark
Participant Type        Name                                                   
     Defendant               Johnson, Edward
Participant Type         Participant Name                                       
    Defendant                Fernacz, Richard                                       
    Arresting Officer        Stewart, William R.
Participant Type         Participant Name                                       
    Defendant                Carty, Danielle M.                                     
    Arresting Officer   

In [179]:
def extract_case_participants(text):
    text = text.strip().split("\n")
    roles, names = [], []
    for line in text[1:]:
        parts = line.split(maxsplit=1)
        if len(parts) == 2:
            role, name = parts
            roles.append(role.strip())
            names.append(name.strip())
    return pd.DataFrame({"role": roles, "name": names})

In [180]:
defendants = {}
for fname in fnames:
    print(fname)
    participants = extract_case_participants(sectioned[fname][key])
    print(participants)
    defend_row = participants[participants["role"].str.contains("Defendant")]["name"]
    if len(defend_row) > 0:
        defendants[fname] = defend_row.values[0]
    else:
        defendants[fname] = ""
    print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
        role                               name
0  Arresting  Officer        Andres, Maxwell W.
1  Defendant                 Young, Larry Lewis

../../data/example_docketsheets_courtsummaries/DS_CP-05-CR-0000151-2006.pdf
        role                 name
0  Defendant  Claar, Patricia Ann

../../data/example_docketsheets_courtsummaries/DS_CP-27-CR-0000035-2005.pdf
        role               name
0  Defendant  Loll, Daniel Mark

../../data/example_docketsheets_courtsummaries/DS_MC-51-CR-0016214-2020.pdf
        role             name
0  Defendant  Johnson, Edward

../../data/example_docketsheets_courtsummaries/DS_MJ-05216-CR-0000076-2022.pdf
        role                                name
0  Defendant                    Fernacz, Richard
1  Arresting  Officer        Stewart, William R.

../../data/example_docketsheets_courtsummaries/DS_MJ-07104-CR-0000001-2023.pdf
        role                              name

In [181]:
key = "DOCKET ENTRY INFORMATION"
[print(sectioned[fname][key]) for fname in fnames if key in sectioned[fname]]

Filed Date Entry                Filer                Applies To                 
    06/02/2015 Waiver of Preliminary Hearing Larry Lewis Young Larry Lewis Young, Defendant
    06/02/2015 Waived for Court     Magisterial District Judge Jeffrey L.  Larry Lewis Young, Defendant
                                    Mensch                                          
    05/05/2015 First Class Summons Accepted Magisterial District Court 17-3-02 Larry Lewis Young, Defendant
    04/20/2015 Certified Summons Accepted Magisterial District Court 17-3-02 Larry Lewis Young, Defendant
    04/14/2015 First Class Fingerprint Order Issued Magisterial District Court 17-3-02 Larry Lewis Young, Defendant
    04/14/2015 Fingerprint Order Issued Magisterial District Court 17-3-02 Larry Lewis Young, Defendant
    04/14/2015 Summons Issued       Magisterial District Court 17-3-02 Larry Lewis Young, Defendant
    04/14/2015 Certified Summons Issued Magisterial District Court 17-3-02 Larry Lewis Young, Defendant


In [182]:
possible_entries = [
    "Waiver of Counsel", "Waived for Court", "Case Transferred to Court of Common",
    "Guilty Plea", "Waiver of Preliminary Hearing", "Commitment Printed - Unable to Post Bail",
    "Criminal Complaint Filed", "Release of Prisoner", "First Class Summons Issued",
    "First Class Summons Accepted", "Summons Issued", "Summons Accepted", "Certified Summons Issued",
    "Certified Summons Accepted", "Docket Transcript Printed", "Court of Common Please Review For",
    "Move to Non-Traffic Case", "Move to Non-Traffic", "Fingerprint Ordered", "Fingerprint order Issues",
    "Disposition Cancelled"
]

def extract_docket_entry(text, defendant_name):
    lines = text.strip().split("\n")[1:]
    if "," in defendant_name:
        defendant_name = f"{defendant_name.split(',')[1]} {defendant_name.split(',')[0]}".strip()
    def_string = f"{defendant_name}, Defendant"
    applies_to, dates, entries, filers = [], [], [], []
    for line in lines:
        line = line.strip()
        if len(line.split()) < 5:
            continue
        if line.endswith(def_string):
            applies_to.append(def_string)
            line = line.replace(def_string, '').strip()
        else:
            applies_to.append("")
        date_match = re.search(r"\d{2}/\d{2}/\d{4}", line)
        if date_match:
            dates.append(date_match.group())
            line = line.replace(date_match.group(), '').strip()
        else:
            dates.append("")
        for entry in possible_entries:
            if entry in line:
                entries.append(entry)
                line = line.replace(entry, '').strip()
                break
        else:
            entries.append("")
        filers.append(line)
    return pd.DataFrame({
        "date": dates,
        "entry": entries,
        "applies_to": applies_to,
        "filer": filers
    })

        


In [183]:
for fname in fnames:
    print(fname)
    if key in sectioned[fname]:
        docket_entry = extract_docket_entry(sectioned[fname][key], defendants[fname])
        print(docket_entry)

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
         date                          entry                    applies_to  \
0  06/02/2015  Waiver of Preliminary Hearing  Larry Lewis Young, Defendant   
1  06/02/2015               Waived for Court  Larry Lewis Young, Defendant   
2  05/05/2015   First Class Summons Accepted  Larry Lewis Young, Defendant   
3  04/20/2015               Summons Accepted  Larry Lewis Young, Defendant   
4  04/14/2015                                 Larry Lewis Young, Defendant   
5  04/14/2015                                 Larry Lewis Young, Defendant   
6  04/14/2015                 Summons Issued  Larry Lewis Young, Defendant   
7  04/14/2015                 Summons Issued  Larry Lewis Young, Defendant   
8  04/14/2015       Criminal Complaint Filed                                 
9  04/14/2015     First Class Summons Issued  Larry Lewis Young, Defendant   

                                               filer  
0     

In [None]:
key = "ATTORNEY INFORMATION"
[print(sectioned[fname][key] + "\n***\n") for fname in fnames if key in sectioned[fname]]

District Attorney                    Assistant District Attorney                
    Name:  David Peter Johnson, Esq.     Name:  Jeffrey J. Crossland, Esq.          
    Representing:  Commonwealth of Pennsylvania Representing:  Commonwealth of Pennsylvania
    Counsel Status:  Active              Counsel Status:  Active                    
    Supreme Court No.:  041281           Supreme Court No.:  047578
***

Private
***

Private                              District Attorney                          
    Name:  Keith J. Bidlingmaier, Esq.   Name:  Bucks County District Attorney's Office
    Representing:  Carty, Danielle M.    Representing:  Commonwealth of Pennsylvania
***

Public Defender                      District Attorney                          
    Name:  Mary Kathryn McAloon, Esq.    Name:  Bucks County District Attorney's Office
    Representing:  Mathiak, Michael Everett Representing:  Commonwealth of Pennsylvania
    Counsel Status:  Active              Counsel Status

In [222]:
attorney_titles = ["District Attorney", "Private", "Public Defender", "Assistant District Attorney", "Court Appointed"]
def extract_attorney_information(text):
    lines = [line.strip() for line in text.strip().split("\n") if line.strip()]
    titles = []
    stop_mark = -1
    while True:
        for title in attorney_titles:
            if lines[0].startswith(title):
                titles.append(title)
                lines[0] = lines[0][len(title):].strip()
                stop_mark = -1
                break
        if not len(lines[0]):
            break
        if stop_mark >= 1:
            raise ValueError(f"Unknown attorney title in {lines[0]}")
        stop_mark += 1
    if len(lines) == 1:
        return pd.DataFrame({"title": titles, "name": [""]*len(titles)})
    names = [name.strip() for name in lines[1].split("Name:") if len(name.strip())]
    return pd.DataFrame({"title": titles, "name": names})


In [224]:
for fname in fnames:
    if key in sectioned[fname]:
        print(fname)
        print(sectioned[fname][key])
        print()
        attornies = extract_attorney_information(sectioned[fname][key])
        print(attornies)
        print()
        print("***")
        print()
        

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
District Attorney                    Assistant District Attorney                
    Name:  David Peter Johnson, Esq.     Name:  Jeffrey J. Crossland, Esq.          
    Representing:  Commonwealth of Pennsylvania Representing:  Commonwealth of Pennsylvania
    Counsel Status:  Active              Counsel Status:  Active                    
    Supreme Court No.:  041281           Supreme Court No.:  047578

                         title                        name
0            District Attorney   David Peter Johnson, Esq.
1  Assistant District Attorney  Jeffrey J. Crossland, Esq.

***

../../data/example_docketsheets_courtsummaries/DS_MJ-05216-CR-0000076-2022.pdf
Private

     title name
0  Private     

***

../../data/example_docketsheets_courtsummaries/DS_MJ-07104-CR-0000001-2023.pdf
Private                              District Attorney                          
    Name:  Keith J. Bidlingmaier, Esq. 

In [225]:
key = "BAIL"
[print(sectioned[fname][key] + "\n***\n") for fname in fnames if key in sectioned[fname]]

Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type Originating Court Percentage Amount 
    Set             06/02/2015  Nonmonetary   Magisterial District Court     $0.00
***

Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type Originating Court Percentage Amount 
    Set             03/25/2022  Nonmonetary   Magisterial District Court     $0.00
***

Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type Originating Court Percentage Amount 
    Set             01/01/2023  Unsecured     Magisterial District Court  $10,000.00
    Change Bail Type 01/02/2023 Monetary      Magisterial District Court  10.00% $75,000.00
***

Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type

In [227]:
def extract_bail(text):
    lines = text.strip().split("\n")
    bail_set_pattern = re.compile(r"Bail Set:\s+Nebbia Status:\s*(\w+)")
    action_pattern = re.compile(r"(\w+)\s+(\d{2}/\d{2}/\d{4})\s+(\w+)\s+(.+?)\s+(\$[\d,\.]+|\d+\.\d+%?\s*\$[\d,\.]+|\$0\.00)")

    nebbia_status = None
    actions = []

    for line in lines:
        bail_set_match = bail_set_pattern.search(line)
        if bail_set_match:
            nebbia_status = bail_set_match.group(1)
        action_match = action_pattern.search(line)
        if action_match:
            action_type = action_match.group(1)
            action_date = action_match.group(2)
            bail_type = action_match.group(3)
            originating_court = action_match.group(4).strip()
            amount = action_match.group(5).strip()
            actions.append({
                "action_type": action_type,
                "action_date": action_date,
                "bail_type": bail_type,
                "originating_court": originating_court,
                "amount": amount
            })

    return {
        "nebbia_status": nebbia_status,
        "actions": actions
    }

In [228]:
for fname in fnames:
    if key in sectioned[fname]:
        print(fname)
        print(sectioned[fname][key])
        print()
        bail_info = extract_bail(sectioned[fname][key])
        print(bail_info)
        print()
        print("***")
        print()

../../data/example_docketsheets_courtsummaries/DS_ MJ-17302-CR-0000035-2015.pdf
Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type Originating Court Percentage Amount 
    Set             06/02/2015  Nonmonetary   Magisterial District Court     $0.00

{'nebbia_status': 'None', 'actions': [{'action_type': 'Set', 'action_date': '06/02/2015', 'bail_type': 'Nonmonetary', 'originating_court': 'Magisterial District Court', 'amount': '$0.00'}]}

***

../../data/example_docketsheets_courtsummaries/DS_MJ-05216-CR-0000076-2022.pdf
Bail Set:                                                        Nebbia Status: None
    Bail Action Type Bail Action Date Bail Type Originating Court Percentage Amount 
    Set             03/25/2022  Nonmonetary   Magisterial District Court     $0.00

{'nebbia_status': 'None', 'actions': [{'action_type': 'Set', 'action_date': '03/25/2022', 'bail_type': 'Nonmonetary', 'originating_court'