In [33]:
from re import T
import sys
from numpy import result_type
sys.path.append('../')
import pandas as pd 
import bolo
from lib.columns import clean_column_names, set_values
from lib.clean import clean_dates, standardize_desc_cols
from lib.uid import gen_uid


def clean_tracking_number(df):
    df.loc[:, 'tracking_number'] = df.internal_affairs.fillna('').str.lower().str.strip()\
        .str.replace(r'(ia-|ia )', '', regex=True)\
        .str.replace('.', '-', regex=False)\
        .str.replace('2018', '18', regex=False)\
        .str.replace('1801', '18-01', regex=False)
    return df.drop(columns='internal_affairs')


def clean_incident_date(df):
    df.incident_date = df.incident_date.fillna('')\
        .str.replace('7/2010 til 5/2012', '07/01/2010', regex=False)
    return df


def clean_disposition(df):
    df.loc[:, 'disposition'] = df.finding.fillna('').str.lower().str.strip()\
        .str.replace('/', ' | ')\
        .str.replace('no action', '', regex=False)
    return df.drop(columns='finding')


def split_name(df):
    names = df.ee_name.str.lower().str.strip().str.extract(r'^(\w+) (\w+)')
    df.loc[:, 'first_name'] = names[0]
    df.loc[:, 'last_name'] = names[1]
    return df.drop(columns='ee_name')


def clean_action(df):
    df.loc[:, 'action'] = df.final_recommendation.fillna('').str.lower().str.strip()\
        .str.replace(
            'three (3) consecutive shift-days suspenion without pay',
            '3-day suspension without pay', regex=False)\
        .str.replace(
            '5-five consecutive shift-days suspension without pay',
            '5-day suspension without pay', regex=False)\
        .str.replace(
            '| day suspension commence 1/28/17',
            'suspended', regex=False)\
        .str.replace('days suspension 2 days', '2-day suspension', regex=False)\
        .str.replace(r'^(\w+) resigned', 'resigned', regex=True)\
        .str.replace('suspended without pay period of 3 days', '3-day suspension', regex=False)\
        .str.replace('frrom', 'from', regex=False)\
        .str.replace('edwin bergeron 10/2/2019', '', regex=False)\
        .str.replace(r'^(\d+) (\w+)', r'\1-\2', regex=True)\
        .str.replace(' / ', ' | ', regex=False)
    return df.drop(columns='final_recommendation')


def combine_columns(df):
    def combine(row):
        txts = []
        if pd.notnull(row.violation_1):
            txts.append('%s' % row.violation_1)
        if pd.notnull(row.violation_2_3):
            txts.append('%s' % row.violation_2_3)
        return '| '.join(txts)
    df.loc[:, 'charges'] = df.apply(combine, axis=1, result_type='reduce')
    df = df.drop(columns=['violation_1', 'violation_2_3'])
    return df


def clean_charges(df):
    df.charges = df.charges.str.lower().str.strip().fillna('')\
        .str.replace(r'^(\d+) (\w+)', r'\1 - \2', regex=True)\
        .str.replace('.', '', regex=False)\
        .str.replace(r' \bgo\b ', ' ', regex=True)\
        .str.replace('/', ' and ', regex=False)\
        .str.replace(r'\(founded\)', '', regex=True)\
        .str.replace(r'^excessive force ', 'excessive force | ', regex=True)\
        .str.replace('omissionn', 'omission', regex=False)\
        .str.replace(r' ?(unbec?o?i?mo?ing) ?( ?(of)? ?(an officer)?)? ?', ' unbecoming ', regex=True)\
        .str.replace('video conduct unbecoming', 'video | conduct unbecoming', regex=False)
    return df


def combine_duplicate_action_disposition_and_charges_rows(df):
    df = df.groupby(['uid', 'tracking_number']).agg({
        'first_name': 'first',
        'last_name': 'first',
        'department_desc': 'first',
        'investigation_start_date': 'first',
        'disposition': ''.join, 
        'action': ''.join,
        'charges': ' | '.join,
        'incident_year': 'first',
        'incident_month': 'first',
        'incident_day': 'first',
        'data_production_year': 'first',
        'agency': 'first',
        'complaint_uid': 'first'
        }).reset_index()
    return df


def drop_rows_without_tracking_number(df):
    return df[df.tracking_number != ''].reset_index(drop=True)


def clean():
    df = pd.read_csv(bolo.data('hammond_pd/hammond_pd_cprr_2015_2020.csv'))\
        .pipe(clean_column_names)
    df = df\
        .rename(columns={
            'dept': 'department_desc',
        })\
        .pipe(split_name)\
        .pipe(clean_tracking_number)\
        .pipe(clean_incident_date)\
        .pipe(clean_disposition)\
        .pipe(clean_action)\
        .pipe(combine_columns)\
        .pipe(clean_charges)\
        .pipe(clean_dates, ['incident_date'])\
        .pipe(drop_rows_without_tracking_number)\
        .pipe(standardize_desc_cols, ['department_desc', 'action', 'charges'])\
        .pipe(set_values, {
            'agency': 'Hammond PD',
            'data_production_year': '2021'
        })\
        .pipe(gen_uid, ['first_name', 'last_name', 'agency'])\
        .pipe(gen_uid, 
        ['uid', 'charges', 'tracking_number', 'disposition', 'investigation_start_date'], 'complaint_uid')\
        .pipe(combine_duplicate_action_disposition_and_charges_rows)
    return df


if __name__ == '__main__':
    df = clean()
    ensure_data_dir('clean')
    df.to_csv(bolo.data('clean/cprr_hammond_pd_2015_2020.csv'), index=False)


In [34]:

pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,uid,tracking_number,first_name,last_name,department_desc,investigation_start_date,disposition,action,charges,incident_year,incident_month,incident_day,data_production_year,agency,complaint_uid
0,10c0c394b40633de6563a4af974295cb,15-10,corey,stewart,police,30-Oct-15,founded,3-day suspension without pay,108 - failure to carry out orders regarding 151 body cam | excessive forcecommision or omission of any act to the prejudice of the department service contrary to the public interest or policy,2015,10,17,2021,Hammond PD,f2ab4e9acf7fae417f76084f97d0cd82
1,1e65c0807675ee3f25f6a6bf25eb121b,15-05,patrick,peterman,police,8-Jan-15,unsubstantiated,no action,conduct unbecoming,2015,5,25,2021,Hammond PD,bde3dfde7954cca14f9bcfb8020874b4
2,573a76ab5827db00f65fc20b0ad22a36,19-05,marquis,reliford,police,31-May-19,founded,resigned,conduct unbecoming,2019,5,28,2021,Hammond PD,81970f7b4e6071327fab34d7c46f97b7
3,76aee80059f22ad98ea8e55ff97dc658,20-07,jared,tate,police,17-Sep-20,unfounded,no action,conduct unbecoming,2020,9,8,2021,Hammond PD,83ff269676e55cc75ef955d755781e48
4,7f2e287a293035250ce9d549b9df7705,15-03,thomas,mushinsky,police,19-Feb-15,unsubstantiated,no action,conduct unbecoming,2015,1,29,2021,Hammond PD,6a3443d650a5d8123429e8131136c144
5,7f2e287a293035250ce9d549b9df7705,15-09,thomas,mushinsky,police,30-Oct-15,,5-day suspension without pay,108 - failure to carry out orders regarding 230 towing and impound | excessive forcecommision or omission of any act to the prejudice of the department service contrary to the public interest or policy,2015,10,17,2021,Hammond PD,8afd1e03bd85c99a2c28431901fd0a81
6,83e7e34218653346a7d4b5a1d89964a2,19-04,dayton,house,police,2-Jul-19,founded,dismissed from service,conduct unbecoming | social media policy,2019,4,23,2021,Hammond PD,23ef313946abe4ed7f89255d7fe8ce27
7,92d3d9c79a3eb44ece5c83e62e55b91d,19-12,jeanne,truz,police,12-Nov-19,unfounded | unsubstantiated,no action,conduct unbecoming | command of temper,2019,11,7,2021,Hammond PD,4c06651c82f0d08f74dea6d272ba660b
8,a52c3653f9d8d31a92302fe6c3ac366a,17-04,craig,dunn,police,29-Sep-17,,no action,use of force continuum | activation of the body cam,2017,8,28,2021,Hammond PD,26f3341ea108229e4ba6e35f9eff8a55
9,a5f3c016d4c3373aa74dc15c0638362e,16-10,jeanne,cruz,police,,founded | preventable class 2,1-day supension,conduct unbecoming,2016,10,28,2021,Hammond PD,3b65640e70e6129d68c26d2afa4ad51e


In [35]:
df.loc[:, ['tracking_number', 'first_name', 'last_name']].iloc[:30]

Unnamed: 0,tracking_number,first_name,last_name
0,15-10,corey,stewart
1,15-05,patrick,peterman
2,19-05,marquis,reliford
3,20-07,jared,tate
4,15-03,thomas,mushinsky
5,15-09,thomas,mushinsky
6,19-04,dayton,house
7,19-12,jeanne,truz
8,17-04,craig,dunn
9,16-10,jeanne,cruz
