In [1]:
import os

import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Officer, OfficerHistory
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, titleize
from data_importer.base.utils.import_officer_history_helpers import import_officer_history

In [2]:
# Connection settings for the storage share the CSV files are read from.
# The account key is taken from the AZURE_STORAGE_ACCOUNT_KEY environment
# variable so it never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback when the variable is unset.
storage = AzureStorage(
    account_name='cpdbv2data',
    account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', '<add_your_key_here>'),
    share='cleaned-data',
)

# Database alias handed to import_officer_history() and to `.using()` in the
# final query of this notebook.
import_db = 'import'

## Read Unit History Data

In [3]:
# Load the two unit-history CSV files through the storage helper.
all_sworn_units_csv = storage.path_for('unit-history/all-sworn-units.csv')
df_all_sworn_units = pd.read_csv(all_sworn_units_csv)

unit_history_csv = storage.path_for('unit-history/unit-history.csv')
df_unit_history = pd.read_csv(unit_history_csv)

In [4]:
# (source column, notebook column) pairs shared by both input frames.
# The source names, in this order, are the only columns kept.
column_pairs = [
    ('UID', 'officer_id'),
    ('Unit', 'unit'),
    ('Start.Date', 'start_date'),
    ('End.Date', 'end_date'),
]
kept_columns = [source for source, _ in column_pairs]
renamed_columns = dict(column_pairs)

# Same two steps for each frame: narrow to the kept columns, then rename them.
df_all_sworn_units = df_all_sworn_units.loc[:, kept_columns].rename(columns=renamed_columns)
df_unit_history = df_unit_history.loc[:, kept_columns].rename(columns=renamed_columns)


## Clean Unit History data

#### Concatenate the two data files and drop duplicated rows

In [5]:
# Stack both frames (unit history first), keep a single copy of rows that agree
# on all four columns, then order the result by officer and unit.
df = (
    pd.concat([df_unit_history, df_all_sworn_units])
    .drop_duplicates(subset=['officer_id', 'unit', 'start_date', 'end_date'])
    .sort_values(by=['officer_id', 'unit'])
)

#### Apply the following cleaning steps
- Replace NaN values with None
- Drop all uninformative rows, i.e. rows that have an empty unit (`-999`) and no start_date and no end_date
- Drop invalid rows that have start_date > end_date
- Drop duplicated rows (already done above, when the two files are concatenated)

In [6]:
# Put None in every cell that pandas reports as null.
df = df.where(df.notnull(), None)

# Rows with the -999 unit marker and neither date present carry no information.
has_no_info = (df['unit'] == -999) & df['start_date'].isnull() & df['end_date'].isnull()
df = df[~has_no_info]

# Rows whose start date compares greater than their end date are invalid.
starts_after_end = df['start_date'] > df['end_date']
df = df[~starts_after_end]

**Sort data by `officer_id`, `unit`, `end_date`, `start_date`, in that order**

In [7]:
# Order the rows before they are handed to the importer one by one.
sort_keys = ['officer_id', 'unit', 'end_date', 'start_date']
df = df.sort_values(by=sort_keys)

## Import and print out the output

In [8]:
# Register `progress_apply` on pandas objects.
# Only keyword options are passed: the previous extra positional argument, a
# freshly created `tqdm_notebook()`, was taken by tqdm as the iterable to wrap
# and left an empty widget in the cell output.
tqdm.pandas(desc='Import Officer History')

# Call import_officer_history once per row (axis=1), passing the target
# database alias along with the row.
df.progress_apply(
    lambda row: import_officer_history(row, import_db),
    axis=1
)

# Read the OfficerHistory records back from the same database alias and show
# them as a DataFrame (last expression of the cell, so it is displayed).
pd.DataFrame.from_records(OfficerHistory.objects.using(import_db).all().values(
    'id', 'officer_id','officer__first_name', 'officer__last_name', 'unit__unit_name', 'effective_date', 'end_date'
))

A Jupyter Widget

Import Officer History: 220769it [13:35, 270.69it/s]         


Unnamed: 0,effective_date,end_date,id,officer__first_name,officer__last_name,officer_id,unit__unit_name
0,2016-06-05,,1,Jeffery,Aaron,1,003
1,,2007-03-28,2,Jeffery,Aaron,1,014
2,2007-03-29,2016-06-04,3,Jeffery,Aaron,1,014
3,,2007-03-28,4,Karina,Aaron,2,015
4,2007-03-29,,5,Karina,Aaron,2,015
5,1976-08-19,1976-08-20,6,Daniel,Abate,3,001
6,1971-01-16,1976-08-18,7,Daniel,Abate,3,004
7,1970-06-15,1971-01-15,8,Daniel,Abate,3,022
8,1976-08-21,,9,Daniel,Abate,3,543
9,1996-09-12,2005-05-25,10,Anthony,Abbate,4,011
