In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Investigator, PoliceUnit
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, titleize, ZFill

In [2]:
# Read the account key from the environment so the real secret never has to be
# typed into (or committed with) the notebook. The literal placeholder stays as
# the fallback, so an unset variable behaves exactly as before.
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', 'your-key-here'),
                       share='cleaned-data')

# Value handed to `.using(...)` by every ORM call in this notebook.
import_db = 'import'

## Read Investigator data

In [3]:
# Load the investigators export, keep the three columns used by the import,
# give them model-friendly names and keep one row per investigator id.
df = (
    pd.read_csv(storage.path_for('complaints/investigators.csv'))
    .loc[:, ['investigators_ID', 'Assignment', 'Current.Rank']]
    .rename(columns={
        'investigators_ID': 'id',
        'Assignment': 'unit',
        'Current.Rank': 'current_rank',
    })
    .drop_duplicates(subset=['id'])
)

## Clean Investigator data

In [4]:
# Per-column cleaning steps, listed in the order they are handed to DataCleaner.
investigator_schema = {
    'current_rank': [strip, titleize],
    'unit': [strip, ZFill(padding=3)],
}

cleaner = DataCleaner(schema=investigator_schema)
# Left as the last expression so the notebook renders whatever `perform` returns.
cleaner.perform(df)

100%|██████████| 2/2 [00:00<00:00,  9.04it/s]


Unnamed: 0,id,unit,current_rank
0,1,013,Lieutenant Of Police
1,2,008,Sergeant Of Police
2,3,608,Lieutenant Of Police
3,4,113,Supervising Inv Ipra
4,5,020,Sergeant Of Police
5,6,001,Sergeant Of Police
6,7,113,Investigator 2 Ipra
7,8,113,Investigator 3 Ipra
8,9,120,Chief
9,10,113,Investigator 3 Ipra


#### Add `unit_id` field and drop `unit` field
*For each `unit`, get the corresponding `unit_id` from the imported `PoliceUnit` data*

In [5]:
# Pull every PoliceUnit row from the import database into a frame for lookups.
police_unit_rows = PoliceUnit.objects.using(import_db).all().values()
df_units = pd.DataFrame.from_records(police_unit_rows)

In [6]:
def to_unit_id(unit):
    """Look up the `PoliceUnit` id for a unit name.

    Args:
        unit: Unit name to match against the `unit_name` column of the
            notebook-level `df_units` frame.

    Returns:
        The `id` value of the first row of `df_units` whose `unit_name`
        equals `unit`, or `np.nan` when `unit` is falsy or nothing matches.
    """
    # Use the lowercase `np.nan`: the `np.NaN` alias was removed in NumPy 2.0,
    # while `np.nan` is available in every NumPy release.
    if not unit:
        return np.nan
    df_found_units = df_units[df_units['unit_name'] == unit]
    if df_found_units.empty:
        return np.nan
    # If several rows share the name, the first one wins.
    return df_found_units.iloc[0]['id']

In [7]:
# Register `progress_apply` on pandas objects, with a labelled progress bar.
tqdm.pandas(tqdm_notebook(), desc='Get unit id')

# Translate each unit name to its id, then swap the `unit` column for `unit_id`.
unit_ids = df['unit'].progress_apply(to_unit_id)
df = df.assign(unit_id=unit_ids).drop('unit', axis=1)

A Jupyter Widget

Get unit id: 100%|██████████| 3198/3198 [00:01<00:00, 1622.22it/s]


#### Replace all `NaN` with `None`

In [8]:
# Cast to object before masking: on float columns recent pandas versions keep
# NaN when `other` is None, so missing values would otherwise never turn into
# the None that the ORM needs in order to store NULL.
df = df.astype(object).where(pd.notnull(df), None)

## Import Investigator data

#### Delete existing investigators before importing

In [9]:
# Clear out previously imported investigators so the import starts from an empty table.
# Kept as a bare expression so the notebook shows the deletion summary.
Investigator.objects.using(import_db).all().delete()

(3198, {u'data.Investigator': 3198})

#### Import and Print out Investigator data

In [10]:
def create_investigator(row):
    """Insert one row of `df` as an `Investigator` in the import database."""
    return Investigator.objects.using(import_db).create(**row)


tqdm.pandas(tqdm_notebook(), desc='Import Investigator')
df.progress_apply(create_investigator, axis=1)

# Read the table back so the notebook displays what was actually stored.
imported_rows = Investigator.objects.using(import_db).all().values()
pd.DataFrame.from_records(imported_rows)

A Jupyter Widget

Import Investigator: 105it [00:00, 1043.73it/s]           




Import Investigator: 3198it [00:02, 1089.63it/s]


Unnamed: 0,current_rank,id,name,raw_name,unit_id
0,Lieutenant Of Police,1,,,14.0
1,Sergeant Of Police,2,,,9.0
2,Lieutenant Of Police,3,,,201.0
3,Sergeant Of Police,3031,,,13.0
4,Sergeant Of Police,3032,,,20.0
5,Sergeant Of Police,3033,,,62.0
6,Sergeant Of Police,3034,,,26.0
7,Sergeant Of Police,3035,,,7.0
8,Sergeant Of Police,3036,,,210.0
9,Sergeant Of Police,3037,,,19.0
