In [104]:
# import os
# os.chdir("../")

In [105]:
import pandas as pd
import deba
from lib.clean import clean_dates, standardize_desc_cols
from lib.columns import clean_column_names, set_values


def drop_tracking_id_row_if_not_2020(df):
    df.loc[:, "tracking_id"] = (
        df.tracking_id.str.lower()
        .str.strip()
        .str.replace(r"^(15|16|17|18|19|201[6789]|202[125]|21)(.+)?", "", regex=True)
        .fillna("")
        .str.replace(r"221-(.+)", "", regex=True)
    )
    return df[~((df.tracking_id == ""))]


def drop_receive_date_row_if_not_2020(df):
    df.loc[:, "receive_date"] = df.receive_date.str.replace(
        r"20(1[56789]|21)(.+)", "", regex=True
    )
    return df[~((df.receive_date == ""))]


def drop_receive_year_row_if_not_2020(df):
    df.loc[:, "receive_year"] = (
        df.receive_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(0[123456789]|1[0123456789]|21)(.+)?", "", regex=True)
    )
    return df[~((df.receive_year == ""))]


def drop_occur_date_row_if_not_2020(df):
    df.loc[:, "occur_year"] = (
        df.occur_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.occur_year == ""))]


def drop_investigation_complete_year_row_if_not_2020(df):
    df.loc[:, "investigation_complete_year"] = (
        df.investigation_complete_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.investigation_complete_year == ""))]


def drop_incident_year_row_if_not_2020(df):
    df.loc[:, "incident_year"] = df.incident_year.astype(str).str.replace(
        r"20(1[123456789]|21)(.+)?", "", regex=True
    )
    return df[~((df.incident_year == ""))]


def drop_case_number_row_if_not_2020(df):
    df.loc[:, "case_number"] = (
        df.case_number.astype(str)
        .str.replace(r"20(1[123456789]|21)(.+)?", "", regex=True)
        .str.replace(r" +", "", regex=True)
    )
    return df[~((df.case_number == ""))]


def drop_investigation_start_year_row_if_not_2020(df):
    df.loc[:, "investigation_start_year"] = (
        df.investigation_start_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.investigation_start_year == ""))]


In [106]:

def clean():
    plaquemines_so = (
        pd.read_csv(
            deba.data("clean/cprr_plaquemines_so_2016_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    west_monroe_pd = pd.read_csv(
        deba.data("clean/cprr_west_monroe_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    washington_so = (
        pd.read_csv(
            deba.data("clean/cprr_washington_so_2015_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    tangipahoa_so = (
        pd.read_csv(
            deba.data("clean/cprr_tangipahoa_so_2015_2021.csv")
        )
        .pipe(drop_receive_date_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    st_tammany_so = (
        pd.read_csv(
            deba.data("clean/cprr_st_tammany_so_2011_2021.csv")
        )
        .pipe(drop_occur_date_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    st_landry_so = (
        pd.read_csv(deba.data("clean/cprr_st_landry_so_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    scott_pd = (
        pd.read_csv(deba.data("clean/cprr_scott_pd_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    rayne_pd = (
        pd.read_csv(deba.data("clean/cprr_rayne_pd_2019_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    ponchatoula_pd = (
        pd.read_csv(
            deba.data("clean/cprr_ponchatoula_pd_2010_2020.csv")
        )
        .pipe(clean_dates, ["receive_date"])
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    natchitoches_so = pd.read_csv(deba.data("clean/cprr_natchitoches_so_2018_21.csv"))\
        .pipe(set_values, {"tmp": "1"})
    

    new_orleans_so = (
        pd.read_csv(deba.data("clean/cprr_st_landry_so_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    new_orleans_pd = (
        pd.read_csv(
            deba.data("clean/cprr_new_orleans_da_2016_2020.csv")
        )
        .pipe(clean_column_names)
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    maurice_pd = (
        pd.read_csv(deba.data("clean/cprr_maurice_pd_2020_2021.csv"))
        .pipe(drop_incident_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    madisonville_pd = pd.read_csv(
        deba.data("clean/cprr_madisonville_pd_2010_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    levee_pd = (
        pd.read_csv(deba.data("clean/cprr_levee_pd.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    lake_charles_pd = pd.read_csv(
        deba.data("clean/cprr_lake_charles_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    lafayette_so = (
        pd.read_csv(deba.data("clean/cprr_lafayette_so_2015_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    lafayette_pd = (
        pd.read_csv(deba.data("clean/cprr_lafayette_pd_2015_2020.csv"))
        .pipe(clean_dates, ["receive_date"])
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    houma_pd = (
        pd.read_csv(deba.data("clean/cprr_houma_pd_2019_2021.csv"))\
        .rename(columns={"ecase_number": "case_number"})
        .pipe(drop_case_number_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    hammond_pd = (
        pd.read_csv(deba.data("clean/cprr_hammond_pd_2015_2020.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    eunice_pd = (
        pd.read_csv(deba.data("clean/cprr_eunice_pd_2019_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    erath_pd = (
        pd.read_csv(deba.data("clean/cprr_erath_pd_2018_2020.csv"))
        .pipe(drop_investigation_start_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    denham_springs_pd = (
        pd.read_csv(
            deba.data("clean/cprr_denham_springs_pd_2016_2021.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    cameron_so = pd.read_csv(
        deba.data("clean/cprr_cameron_so_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    brusly_pd = (
        pd.read_csv(deba.data("clean/cprr_brusly_pd_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    bossier_city_pd = pd.read_csv(
        deba.data("clean/cprr_bossier_city_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    benton_pd = (
        pd.read_csv(deba.data("clean/cprr_benton_pd_2015_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    baton_rouge_so = (
        pd.read_csv(
            deba.data("clean/cprr_baton_rouge_so_2016_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    baton_rouge_pd = (
        pd.read_csv(deba.data("clean/cprr_baton_rouge_pd_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    acadia_so = (
        pd.read_csv(deba.data("clean/cprr_acadia_so_2018_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )
    
    st_james_so = (
        pd.read_csv(deba.data("clean/cprr_st_james_so_2019_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )
    
    abbeville_pd = (
        pd.read_csv(deba.data("clean/cprr_abbeville_pd_2019_2021.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )


    lafourche_so = pd.read_csv(deba.data("clean/cprr_lafourche_so_2019_2021.csv"))\
        .pipe(drop_tracking_id_row_if_not_2020)\
        .pipe(set_values, {"tmp": "1"})

    dfs = [
        abbeville_pd,
        acadia_so,
        baton_rouge_pd,
        baton_rouge_so,
        benton_pd,
        bossier_city_pd,
        brusly_pd,
        cameron_so,
        denham_springs_pd,
        erath_pd,
        eunice_pd,
        hammond_pd,
        houma_pd,
        lafayette_pd,
        lafayette_so,
        lafourche_so,
        lake_charles_pd,
        levee_pd,
        madisonville_pd,
        maurice_pd,
        natchitoches_so,
        new_orleans_pd,
        new_orleans_so,
        ponchatoula_pd,
        plaquemines_so,
        rayne_pd,
        scott_pd,
        st_landry_so,
        st_james_so,
        st_tammany_so,
        tangipahoa_so,
        washington_so,
        west_monroe_pd,
    ]

    df = pd.concat(dfs, join="outer")

    return df

In [107]:
df = clean()

In [108]:
"""
Drop rows missing a disposition value
"""
df = df[~((df.disposition.fillna("") == ""))]

In [109]:
df.agency.nunique()

28

In [110]:
df.disposition.count()

1229

In [111]:
"""
Normalized disposition counts, i.e., 22% of dispositions were sustained after an internal investigation
"""
df.disposition.value_counts(normalize=True)

pending investigation                                                                 0.294548
sustained                                                                             0.218063
unfounded                                                                             0.202604
exonerated                                                                            0.105777
not sustained                                                                         0.077299
withdrawn; mediation                                                                  0.041497
founded                                                                               0.010578
unsustained                                                                           0.008950
cancelled                                                                             0.004882
duplicate investigation                                                               0.003255
no further investigation merited                  

In [112]:
"""
Analyze disciplinary actions for sustained allegations
"""
df = df[df.disposition.isin(["sustained"])]

In [113]:
"""
df is filtered for sustained dispositions
"""
df.disposition.unique()

array(['sustained'], dtype=object)

In [114]:
"""
Fill na values
"""
def fill_nas(df):
    df.loc[:, 'action'] = df.action.str.lower().str.strip().fillna("n/a")
    return df

In [115]:
df = df.pipe(fill_nas)

In [116]:
"""
Standardize text formatting for disciplinary action column
"""
df = df.pipe(standardize_desc_cols, ["action"])

In [117]:
"""
37% of sustained allegations have a corresponding disciplinary action
"""
df.action.value_counts(normalize=True)

n/a                                                                            0.630597
written reprimand                                                              0.033582
termination                                                                    0.029851
resigned                                                                       0.026119
2-day suspension                                                               0.022388
terminated                                                                     0.018657
letter of reprimand                                                            0.018657
lod                                                                            0.018657
3-day suspension                                                               0.014925
suspension                                                                     0.014925
5-day suspension                                                               0.011194
letter of reprimand/good samarit

In [118]:
def drop_na_action_values(df):
    df.loc[:, "action"] = df.action.str.replace(r"^n/a$", "", regex=True)
    return df[~((df.action.fillna("") == ""))]

In [119]:
df = df.pipe(drop_na_action_values)

In [120]:
"""
Disciplinary action data 
"""
df.action.value_counts(normalize=True)

written reprimand                                                              0.090909
termination                                                                    0.080808
resigned                                                                       0.070707
2-day suspension                                                               0.060606
letter of reprimand                                                            0.050505
terminated                                                                     0.050505
lod                                                                            0.050505
3-day suspension                                                               0.040404
suspension                                                                     0.040404
letter of reprimand/good samaritan law training                                0.030303
written                                                                        0.030303
letter of instruction/mandatory 

In [121]:
"""
Group similar disicplinary actions
"""
def group_actions(df):
    df.loc[:, 'action'] = df.action.str.lower().str.strip().fillna("n/a")\
        .str.replace(r'(.+)?suspend?e?d?s?i?o?n?(.+)?', 'suspended', regex=True)\
        .str.replace(r"(.+)?(reprimand|written|letter)(.+)?", "written or verbal reprimand", regex=True)\
        .str.replace(r"^termination$", "terminated", regex=True)\
        .str.replace(r"(.+)?train(.+)?", "training", regex=True)\
        .str.replace(r"(.+)?demotion(.+)?", "demoted", regex=True)\
        .str.replace(r"(.+)?loss of unit(.+)?", "lost unit privileges", regex=True)\
        .str.replace(r"(.+)?counsel(ing)?(.+)?", "counseled", regex=True)\
        .str.replace(r"(.+)?warning(.+)?", "warned", regex=True)
    return df

In [122]:
df = df.pipe(group_actions)

In [123]:
"""
Disciplinary action data after grouping similar actions
"""
df.action.value_counts(normalize=True)

suspended                      0.323232
written or verbal reprimand    0.262626
terminated                     0.131313
resigned                       0.070707
training                       0.050505
lod                            0.050505
counseled                      0.040404
1-day driving school           0.020202
lost unit privileges           0.010101
retired                        0.010101
separated from employment      0.010101
disciplined                    0.010101
other                          0.010101
Name: action, dtype: float64