In [1]:
import polars as pl
from extract_functions import parse_cases, parse_offenses, parse_parties, parse_events_from_pre
import os

In [2]:
# Load every distinct row of the CaseVerbatim table. The connection URI is
# taken from the "database" environment variable (KeyError if it is unset).
df = pl.read_database_uri(
    "SELECT DISTINCT * FROM CaseVerbatim",
    os.environ["database"],
)

In [3]:
# Hand the whole CaseSummary column to the parser as a plain Python list;
# the result is expected to line up one-to-one with the rows of `df`.
parsed_cases = parse_cases(df.get_column("CaseSummary").to_list())

In [4]:
# Pair each CaseID with its parsed summary, then spread the parsed fields out
# into top-level columns.
# NOTE(review): unnest requires "CaseInfo" to be a struct column, so
# parse_cases presumably returns one dict per case — confirm.
case_information = pl.DataFrame(
    {
        "CaseID": df.get_column("CaseID").to_list(),
        "CaseInfo": parsed_cases,
    }
).unnest("CaseInfo")

In [5]:
# Discard the parser's own "case_id" field; the CaseID column built from the
# source table is kept instead.
case_information = case_information.drop("case_id")

# Relabel the remaining columns positionally with the output schema names.
# NOTE(review): this relies on the order of the fields produced by
# parse_cases — verify against that function if its output ever changes.
case_information.columns = [
    "CaseID",
    "County",
    "DocketNumber",
    "DefendantName",
    "DefendantType",
    "Judge",
    "Classification",
    "FiledDate",
    "TerminationDate",
]

In [6]:
# Normalise County casing: keep the first character exactly as it is and
# lower-case everything after it (e.g. "ADAMS" -> "Adams").
#
# This uses native polars string expressions instead of a per-row Python
# lambda through map_elements: the work stays inside the polars engine and the
# output dtype is known up front rather than inferred from the data.
# Empty strings stay empty ("" + "" == "") and nulls stay null, matching the
# previous lambda (map_elements skips nulls by default).
case_information = case_information.with_columns(
    (
        pl.col("County").str.slice(0, 1)
        + pl.col("County").str.slice(1).str.to_lowercase()
    ).alias("County")
)

In [7]:
# Persist the cleaned case table as "CaseInformation".
# The URI from the environment has every "//" widened to "///" before being
# passed on.
# NOTE(review): presumably this converts the URI form accepted by
# read_database_uri into the form the write engine expects — confirm.
case_information.write_database(
    "CaseInformation",
    os.environ["database"].replace("//", "///"),
    if_exists="replace",
)

In [8]:
# Parse the three remaining verbatim sections of every case.
# parse_parties and parse_offenses receive the whole column as a list, while
# parse_events_from_pre is applied to one register-of-actions entry at a time.
parsed_parties = parse_parties(df.get_column("Parties").to_list())
parsed_offenses = parse_offenses(df.get_column("OffenseInformation").to_list())
parsed_actions = list(
    map(parse_events_from_pre, df.get_column("RegisterOfActions").to_list())
)

In [9]:
# Build the Parties table: one row per (case, party).
#   1. Pair each CaseID with its list of parsed parties.
#   2. explode -> one row per party; unnest -> one column per party field.
#   3. Collapse the list of address lines into one comma-separated string.
#      Native `list.join` replaces the former Python lambda via map_elements,
#      so the join runs inside polars and the result dtype is not inferred
#      from data. Null address lists remain null.
#   4. Keep only the published columns and rename them by explicit mapping
#      rather than by position, so a reordered select cannot mislabel data.
parties = (
    pl.DataFrame({"CaseID": df["CaseID"].to_list(), "Parties": parsed_parties})
    .explode("Parties")
    .unnest("Parties")
    .with_columns(pl.col("address_lines").list.join(", ").alias("Address"))
    .select(["CaseID", "role", "status", "name", "attorney_name", "Address"])
    .rename(
        {
            "role": "Role",
            "status": "Status",
            "name": "Name",
            "attorney_name": "AttorneyName",
        }
    )
)

# Persist as "Parties"; the environment URI has each "//" widened to "///"
# before it is handed to write_database.
parties.write_database(
    "Parties",
    os.environ["database"].replace("//", "///"),
    if_exists="replace",
)

In [10]:
# Pair each CaseID with its parsed offenses, fan the list out to one row per
# offense, then lift the offense fields into their own columns.
offenses = (
    pl.DataFrame(
        {
            "CaseID": df.get_column("CaseID").to_list(),
            "Offenses": parsed_offenses,
        }
    )
    .explode("Offenses")
    .unnest("Offenses")
)

In [11]:
# Drop rows where either "count" or "charge" is null, then narrow the frame
# to the columns that are written out.
offenses = (
    offenses
    .drop_nulls(subset=["count", "charge"])
    .select(["CaseID", "count", "charge", "offense_class", "offense_date", "plea"])
)

In [12]:
# Store the offense count as a 64-bit integer.
offenses = offenses.with_columns(pl.col("count").cast(pl.Int64))

# Positional relabel to the output schema names; the order must match the
# current column order of `offenses`.
offenses.columns = [
    "CaseID",
    "Count",
    "Charge",
    "OffenseClass",
    "OffenseDate",
    "Plea",
]

In [13]:
# Persist the offenses as "Offenses"; the environment URI has each "//"
# widened to "///" before it is handed to write_database.
offenses.write_database(
    "Offenses",
    os.environ["database"].replace("//", "///"),
    if_exists="replace",
)

In [14]:
# Pair each CaseID with its parsed register-of-actions events, fan the list
# out to one row per event, then lift the event fields into columns.
actions = (
    pl.DataFrame(
        {
            "CaseID": df.get_column("CaseID").to_list(),
            "actions": parsed_actions,
        }
    )
    .explode("actions")
    .unnest("actions")
)

In [15]:
# Keep only the event fields that end up in the Actions table.
actions = actions.select(
    "CaseID",
    "date",
    "event_type",
    "initiated_by",
    "links",
)

In [16]:
# Collapse the "links" list column to a single string per event: the sole
# element when the list has exactly one entry, otherwise "".
# NOTE(review): events with two or more links also become "" — confirm that
# discarding them is intended.
#
# return_dtype is pinned to a string type so the output dtype does not depend
# on what polars infers from the first values it sees. Null lists stay null
# because map_elements skips nulls by default.
actions = actions.with_columns(
    pl.col("links").map_elements(
        lambda links: links[0] if len(links) == 1 else "",
        return_dtype=pl.Utf8,
    )
)

In [17]:
# Positional relabel to the output schema names; the order must match the
# current column order of `actions`.
actions.columns = [
    "CaseID",
    "Date",
    "ActionType",
    "InitiatedBy",
    "Link",
]

In [18]:
# Persist the events as "Actions"; the environment URI has each "//" widened
# to "///" before it is handed to write_database.
actions.write_database(
    "Actions",
    os.environ["database"].replace("//", "///"),
    if_exists="replace",
)