In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Allegation, AllegationCategory, Officer, OfficerAllegation
from data_importer.base.cleaner import DataCleaner, strip, titleize, to_int, ZFill
from data_importer.base.storage import AzureStorage

In [2]:
# Azure file share holding the cleaned CSV inputs.
# The account key is read from the AZURE_STORAGE_ACCOUNT_KEY environment
# variable so the secret never has to be pasted into (and saved with) the
# notebook. The literal placeholder is kept as the fallback so the previous
# "edit the cell by hand" workflow still behaves as before.
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', 'add-your-key-here'),
                       share='cleaned-data')

# Database alias handed to every `.using(...)` call in this notebook.
import_db = 'import'

In [3]:
def allegation_category_code(value):
    """Return the part of ``value`` that precedes its first hyphen.

    Values that ``pd.isnull`` reports as missing are handed back unchanged;
    a string without a hyphen is returned whole.
    """
    if not pd.isnull(value):
        code, _, _ = value.partition('-')
        return code
    return value

## Read Officer Accused data

In [4]:
# Source CSV column -> name used for that column in the rest of the notebook.
# The same mapping selects the columns to load and renames them afterwards.
accused_columns = {
    'CRID': 'crid',
    'Complaint.Category': 'allegation_category',
    'Recommended.Finding': 'recc_finding',
    'Final.Finding': 'final_finding',
    'Recommended.Discipline': 'recc_outcome',
    'Final.Discipline': 'final_outcome',
    'UID': 'officer_id',
}

accused_csv = storage.path_for('complaints/accused.csv')
df = pd.read_csv(accused_csv, usecols=list(accused_columns))
df = df.rename(columns=accused_columns)

## Clean Officer Accused data
#### Perform the following steps
- Fill all empty `recc_finding` and `final_finding` with `ZZ`
- Fill all empty `recc_outcome` and `final_outcome` with `''`
- Change `final_finding` from `DIS` to `DS`
- Fill `final_outcome_class` with `''`

In [5]:
# Per-column cleaning pipeline; the return value of perform() is not used,
# so the cleaner is relied on to update `df` itself.
# NOTE(review): ZFill(3) presumably zero-pads the outcome codes to width 3 —
# confirm against data_importer.base.cleaner.
accused_cleaner = DataCleaner(
    schema={
        'crid': [strip],
        'allegation_category': [strip, allegation_category_code],
        'recc_outcome': [strip, ZFill(3)],
        'final_outcome': [strip, ZFill(3)],
        'recc_finding': [strip],
        'final_finding': [strip],
    }
)
accused_cleaner.perform(df)

# Missing findings become 'ZZ'; missing outcomes become the empty string.
fill_values = (
    ('recc_finding', 'ZZ'),
    ('final_finding', 'ZZ'),
    ('recc_outcome', ''),
    ('final_outcome', ''),
)
for column_name, fill_value in fill_values:
    df[column_name] = df[column_name].fillna(fill_value)

# Rewrite the 'DIS' final finding as 'DS'.
df['final_finding'] = df['final_finding'].replace('DIS', 'DS')

# Every row gets an empty final_outcome_class.
df['final_outcome_class'] = ''

100%|██████████| 6/6 [00:13<00:00,  2.21s/it]


#### Get corresponding allegation id, start date and end date

In [6]:
# Map each allegation's crid to a small record holding its primary key.
# A later cell adds 'start_date' / 'end_date' to these records.
# Built with a comprehension so no loop variable named `id` is left behind
# shadowing the builtin `id()` for the rest of the notebook session.
allegation_map = {
    allegation_crid: {'id': allegation_pk}
    for allegation_crid, allegation_pk
    in Allegation.objects.using(import_db).values_list('crid', 'id')
}

In [7]:
# Load the complaint open/close dates; CRID is passed through `strip` while
# parsing so it is compared against `allegation_map` keys in cleaned form.
df_allegations = pd.read_csv(
    storage.path_for('complaints/complaints.csv'),
    usecols=['CRID', 'Complaint.Date', 'Closed.Date'],
    converters={
        'CRID': strip
    }
)

# drop_duplicates() returns a new frame; without the assignment the call
# had no effect and duplicate rows were processed again below.
df_allegations = df_allegations.drop_duplicates()
df_allegations = df_allegations.where(pd.notnull(df_allegations), None)

# Attach the dates to the record of the matching allegation. A plain loop is
# used because this step exists only for its side effect on `allegation_map`.
# A CRID that is absent from `allegation_map` raises KeyError here.
for complaint_crid, complaint_date, closed_date in zip(
        df_allegations['CRID'],
        df_allegations['Complaint.Date'],
        df_allegations['Closed.Date']):
    allegation_map[complaint_crid].update({
        'start_date': complaint_date,
        'end_date': closed_date
    })

# Resolve every accused row's crid to the allegation id and its dates.
# The 'start_date' lookup raises KeyError for a crid that had no row in
# complaints.csv, since only the loop above adds that key.
df = df.assign(
    allegation_id=df['crid'].apply(lambda x: allegation_map[x]['id']),
    start_date=df['crid'].apply(lambda x: allegation_map[x]['start_date']),
    end_date=df['crid'].apply(lambda x: allegation_map[x].get('end_date', None))
)

# crid has been replaced by allegation_id and is no longer needed.
del df['crid']

#### Get corresponding officer current age

In [8]:
# Load officer UID and current age, keeping only rows that have an age.
df_officers = pd.read_csv(
    storage.path_for('profiles/officer-profiles.csv'),
    usecols=['UID', 'Current.Age']
)
# .copy() gives the cleaner an independent frame to modify instead of the
# result of a boolean filter (which triggers pandas' chained-assignment
# warning when written to).
df_officers = df_officers[df_officers['Current.Age'].notnull()].copy()

DataCleaner(
    schema={
        'Current.Age': [to_int]
    }
).perform(df_officers)

# UID -> current age. Built directly from the two columns rather than by a
# row-wise apply() whose only purpose was to mutate a dict; for repeated
# UIDs the last row still wins.
officer_age_map = dict(zip(df_officers['UID'], df_officers['Current.Age']))

# Officers without a known age get None.
df = df.assign(
    officer_age=df['officer_id'].apply(lambda x: officer_age_map.get(x, None))
)


100%|██████████| 1/1 [00:00<00:00,  2.35it/s]


#### Get corresponding allegation category

In [9]:
# category_code -> AllegationCategory primary key.
# dict() consumes the (category_code, id) pairs directly, so no loop
# variable named `id` is left shadowing the builtin `id()`.
allegation_category_map = dict(
    AllegationCategory.objects.using(import_db).values_list('category_code', 'id')
)

# Codes with no matching category resolve to None.
df = df.assign(
    allegation_category_id=df['allegation_category'].apply(lambda x: allegation_category_map.get(x, None))
)

# The raw code column has been replaced by allegation_category_id.
del df['allegation_category']

#### Replace all `NaN` with `None`

In [10]:
# Cast to object first: on newer pandas releases a numeric column cannot hold
# None, so `where(..., None)` alone would leave NaN in float columns.
# With object columns every missing value becomes None.
df = df.astype(object).where(pd.notnull(df), None)

## Import Officer Accused data

#### Delete all existing Officer Accused records before importing

In [11]:
# Destructive: no filter is applied, so this deletes every OfficerAllegation
# row reachable through the `import_db` database alias. The value returned by
# delete() is the cell's output.
OfficerAllegation.objects.using(import_db).delete()

(105844, {u'data.OfficerAllegation': 105844})

#### Import and print out the imported data

In [12]:
def create_officer_allegation(row):
    """Create one ``OfficerAllegation`` in the ``import_db`` database.

    ``row`` is a single row of ``df``; its labels are unpacked as the
    keyword arguments passed to ``create``.
    """
    return OfficerAllegation.objects.using(import_db).create(**row)


# Register progress_apply on pandas objects, then insert the rows one by one.
tqdm.pandas(tqdm_notebook(), desc='Importing Officer Accused data')
df.progress_apply(create_officer_allegation, axis=1)

# Read the table back so the cell displays what was stored.
pd.DataFrame.from_records(OfficerAllegation.objects.using(import_db).all().values())

A Jupyter Widget

Importing Officer Accused data: 125581it [02:38, 792.75it/s]          


Unnamed: 0,allegation_category_id,allegation_id,end_date,final_finding,final_outcome,final_outcome_class,id,officer_age,officer_id,recc_finding,recc_outcome,start_date
0,31.0,37224,2005-02-19,UN,600,,150327,48.0,11948,UN,600,2004-03-11
1,31.0,37224,2005-02-19,UN,600,,150328,48.0,23893,UN,600,2004-03-11
2,31.0,37224,2005-02-19,UN,600,,150329,48.0,10511,UN,600,2004-03-11
3,31.0,37224,2005-02-19,ZZ,,,150330,45.0,8710,ZZ,,2004-03-11
4,31.0,37224,2005-02-19,UN,600,,150331,60.0,18131,UN,600,2004-03-11
5,31.0,37224,2005-02-19,UN,600,,150332,47.0,22811,UN,600,2004-03-11
6,70.0,37225,2004-12-06,ZZ,,,150333,62.0,28447,ZZ,,2004-03-11
7,208.0,37227,2005-03-02,EX,600,,150334,47.0,31811,EX,600,2004-03-11
8,208.0,37227,2005-03-02,EX,600,,150335,47.0,16085,EX,600,2004-03-11
9,208.0,37227,2005-03-02,EX,600,,150336,54.0,29516,EX,600,2004-03-11
