In [69]:
# import os
# os.chdir("../")

In [70]:
import pandas as pd
import deba
from lib.clean import clean_dates
from lib.columns import clean_column_names, set_values


def drop_tracking_id_row_if_not_2020(df):
    df.loc[:, "tracking_id"] = (
        df.tracking_id.str.lower()
        .str.strip()
        .str.replace(r"^(15|16|17|18|19|201[6789]|202[125]|21)(.+)?", "", regex=True)
        .fillna("")
        .str.replace(r"221-(.+)", "", regex=True)
    )
    return df[~((df.tracking_id == ""))]


def drop_receive_date_row_if_not_2020(df):
    df.loc[:, "receive_date"] = df.receive_date.str.replace(
        r"20(1[56789]|21)(.+)", "", regex=True
    )
    return df[~((df.receive_date == ""))]


def drop_receive_year_row_if_not_2020(df):
    df.loc[:, "receive_year"] = (
        df.receive_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(0[123456789]|1[0123456789]|21)(.+)?", "", regex=True)
    )
    return df[~((df.receive_year == ""))]


def drop_occur_date_row_if_not_2020(df):
    df.loc[:, "occur_year"] = (
        df.occur_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.occur_year == ""))]


def drop_investigation_complete_year_row_if_not_2020(df):
    df.loc[:, "investigation_complete_year"] = (
        df.investigation_complete_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.investigation_complete_year == ""))]


def drop_incident_year_row_if_not_2020(df):
    df.loc[:, "incident_year"] = df.incident_year.astype(str).str.replace(
        r"20(1[123456789]|21)(.+)?", "", regex=True
    )
    return df[~((df.incident_year == ""))]


def drop_case_number_row_if_not_2020(df):
    df.loc[:, "case_number"] = (
        df.case_number.astype(str)
        .str.replace(r"20(1[123456789]|21)(.+)?", "", regex=True)
        .str.replace(r" +", "", regex=True)
    )
    return df[~((df.case_number == ""))]


def drop_investigation_start_year_row_if_not_2020(df):
    df.loc[:, "investigation_start_year"] = (
        df.investigation_start_year.astype(str)
        .str.replace(r"\.0", "", regex=True)
        .fillna("")
        .str.replace(r"20(1[123456789]|21)", "", regex=True)
    )
    return df[~((df.investigation_start_year == ""))]


In [71]:

def clean():
    plaquemines_so = (
        pd.read_csv(
            deba.data("clean/cprr_plaquemines_so_2016_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    west_monroe_pd = pd.read_csv(
        deba.data("clean/cprr_west_monroe_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    washington_so = (
        pd.read_csv(
            deba.data("clean/cprr_washington_so_2015_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    tangipahoa_so = (
        pd.read_csv(
            deba.data("clean/cprr_tangipahoa_so_2015_2021.csv")
        )
        .pipe(drop_receive_date_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    st_tammany_so = (
        pd.read_csv(
            deba.data("clean/cprr_st_tammany_so_2011_2021.csv")
        )
        .pipe(drop_occur_date_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    st_landry_so = (
        pd.read_csv(deba.data("clean/cprr_st_landry_so_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    scott_pd = (
        pd.read_csv(deba.data("clean/cprr_scott_pd_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    rayne_pd = (
        pd.read_csv(deba.data("clean/cprr_rayne_pd_2019_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    ponchatoula_pd = (
        pd.read_csv(
            deba.data("clean/cprr_ponchatoula_pd_2010_2020.csv")
        )
        .pipe(clean_dates, ["receive_date"])
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    natchitoches_so = pd.read_csv(deba.data("clean/cprr_natchitoches_so_2018_21.csv"))\
        .pipe(set_values, {"tmp": "1"})
    

    new_orleans_so = (
        pd.read_csv(deba.data("clean/cprr_st_landry_so_2020.csv"))
        .pipe(drop_investigation_complete_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    new_orleans_pd = (
        pd.read_csv(
            deba.data("clean/cprr_new_orleans_da_2016_2020.csv")
        )
        .pipe(clean_column_names)
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    maurice_pd = (
        pd.read_csv(deba.data("clean/cprr_maurice_pd_2020_2021.csv"))
        .pipe(drop_incident_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    madisonville_pd = pd.read_csv(
        deba.data("clean/cprr_madisonville_pd_2010_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    levee_pd = (
        pd.read_csv(deba.data("clean/cprr_levee_pd.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    lake_charles_pd = pd.read_csv(
        deba.data("clean/cprr_lake_charles_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    lafayette_so = (
        pd.read_csv(deba.data("clean/cprr_lafayette_so_2015_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    lafayette_pd = (
        pd.read_csv(deba.data("clean/cprr_lafayette_pd_2015_2020.csv"))
        .pipe(clean_dates, ["receive_date"])
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    houma_pd = (
        pd.read_csv(deba.data("clean/cprr_houma_pd_2019_2021.csv"))\
        .rename(columns={"ecase_number": "case_number"})
        .pipe(drop_case_number_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    hammond_pd = (
        pd.read_csv(deba.data("clean/cprr_hammond_pd_2015_2020.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    eunice_pd = (
        pd.read_csv(deba.data("clean/cprr_eunice_pd_2019_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    erath_pd = (
        pd.read_csv(deba.data("clean/cprr_erath_pd_2018_2020.csv"))
        .pipe(drop_investigation_start_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    denham_springs_pd = (
        pd.read_csv(
            deba.data("clean/cprr_denham_springs_pd_2016_2021.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    cameron_so = pd.read_csv(
        deba.data("clean/cprr_cameron_so_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    brusly_pd = (
        pd.read_csv(deba.data("clean/cprr_brusly_pd_2020.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    bossier_city_pd = pd.read_csv(
        deba.data("clean/cprr_bossier_city_pd_2020.csv")
    ).pipe(set_values, {"tmp": "1"})

    benton_pd = (
        pd.read_csv(deba.data("clean/cprr_benton_pd_2015_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    baton_rouge_so = (
        pd.read_csv(
            deba.data("clean/cprr_baton_rouge_so_2016_2020.csv")
        )
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    baton_rouge_pd = (
        pd.read_csv(deba.data("clean/cprr_baton_rouge_pd_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )

    acadia_so = (
        pd.read_csv(deba.data("clean/cprr_acadia_so_2018_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )
    
    st_james_so = (
        pd.read_csv(deba.data("clean/cprr_st_james_so_2019_2021.csv"))
        .pipe(drop_receive_year_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )
    
    abbeville_pd = (
        pd.read_csv(deba.data("clean/cprr_abbeville_pd_2019_2021.csv"))
        .pipe(drop_tracking_id_row_if_not_2020)
        .pipe(set_values, {"tmp": "1"})
    )


    lafourche_so = pd.read_csv(deba.data("clean/cprr_lafourche_so_2019_2021.csv"))\
        .pipe(drop_tracking_id_row_if_not_2020)\
        .pipe(set_values, {"tmp": "1"})

    dfs = [
        abbeville_pd,
        acadia_so,
        baton_rouge_pd,
        baton_rouge_so,
        benton_pd,
        bossier_city_pd,
        brusly_pd,
        cameron_so,
        denham_springs_pd,
        erath_pd,
        eunice_pd,
        hammond_pd,
        houma_pd,
        lafayette_pd,
        lafayette_so,
        lafourche_so,
        lake_charles_pd,
        levee_pd,
        madisonville_pd,
        maurice_pd,
        natchitoches_so,
        new_orleans_pd,
        new_orleans_so,
        ponchatoula_pd,
        plaquemines_so,
        rayne_pd,
        scott_pd,
        st_landry_so,
        st_james_so,
        st_tammany_so,
        tangipahoa_so,
        washington_so,
        west_monroe_pd,
    ]

    df = pd.concat(dfs, join="outer")

    return df

In [72]:
df = clean()

In [73]:

def clean_disposition_2020(df):
    df.loc[:, "disposition"] = (
        df.disposition.str.lower()
        .str.strip()
        .str.replace(r"^nfim$", "no further investigation merited", regex=True)
        .str.replace(r'^resigned(.+)', 'resigned', regex=True)
        .str.replace(r'^unsustained$', 'not sustained', regex=True)
        .str.replace(',', '', regex=False).fillna('')
    )
    return df[~((df.disposition == ''))]

In [74]:
df = df.pipe(clean_disposition_2020)

In [75]:
df.agency.nunique()

28

In [76]:
df.disposition.count()

1229

In [77]:
# 22% of allegations were sustained 
df.disposition.value_counts(normalize=True)

pending investigation                                                                 0.294548
sustained                                                                             0.219691
unfounded                                                                             0.202604
exonerated                                                                            0.105777
not sustained                                                                         0.086249
withdrawn; mediation                                                                  0.041497
founded                                                                               0.010578
cancelled                                                                             0.004882
duplicate investigation                                                               0.003255
resigned                                                                              0.003255
active                                            

In [78]:

def clean_action_if_sustained(df):
    df = df[df.disposition.isin(["sustained"])]
    df.loc[:, 'action'] = df.action.str.lower().str.strip().fillna("n/a")\
        .str.replace(r'(.+)?suspend?e?d?s?i?o?n?(.+)?', 'suspended', regex=True)\
        .str.replace(r"(.+)?(reprimand|written|letter)(.+)?", "written or verbal reprimand", regex=True)\
        .str.replace(r"^terminated$", "termination", regex=True)\
        .str.replace(r"(.+)?train(.+)?", "training", regex=True)
    return df

In [79]:
df = df.pipe(clean_action_if_sustained)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'action'] = df.action.str.lower().str.strip().fillna("n/a")\


In [80]:
# 37% of sustained allegations have a corresponding disciplinary action
df.action.value_counts(normalize=True)

n/a                            0.625926
suspended                      0.125926
written or verbal reprimand    0.096296
termination                    0.048148
resigned                       0.025926
training                       0.018519
lod                            0.018519
verbal counseling              0.011111
1-day driving school           0.007407
5-day loss of unit             0.003704
retired                        0.003704
separated from employment      0.003704
disciplined                    0.003704
other                          0.003704
counseled                      0.003704
Name: action, dtype: float64

In [81]:
def drop_na_action_rows_2020(df):
    df.loc[:, "action"] = df.action.str.replace(r"^n/a$", "", regex=True)
    return df[~((df.action.fillna("") == ""))]

In [82]:
df = df.pipe(drop_na_action_rows_2020)

In [83]:
# of the 37% with a corresponding disciplinary action:
# 34% resulted in a suspension; 26% in a written or verbal reprimand; 13% in a termination; 7% in a resignation
df.action.value_counts(normalize=True)

suspended                      0.336634
written or verbal reprimand    0.257426
termination                    0.128713
resigned                       0.069307
training                       0.049505
lod                            0.049505
verbal counseling              0.029703
1-day driving school           0.019802
5-day loss of unit             0.009901
retired                        0.009901
separated from employment      0.009901
disciplined                    0.009901
other                          0.009901
counseled                      0.009901
Name: action, dtype: float64