In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Award, Officer
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, titleize, to_int, ToNaN

In [2]:
storage = AzureStorage(account_name='cpdbv2data', 
                       account_key='your-key-here',
                       share='cleaned-data')

import_db = 'import'

## Read Award data

In [3]:
df = pd.read_csv(storage.path_for('awards/awards.csv')).loc[
    :, ['Award.Type', 'Incident.Start.Date', 'Current.Award.Status', 'Award.Request.Date', 'Incident.End.Date', 
    'Rank', 'Last.Promotion.Date', 'Requester.Full.Name', 'Ceremony.Date', 'Tracking.NO','UID']
]

df = df.rename(columns={
    'Award.ID': 'id',
    'Award.Type': 'type',
    'Incident.Start.Date': 'incident_start_date',
    'Current.Award.Status': 'current_status',
    'Award.Request.Date': 'request_date',
    'Incident.End.Date': 'incident_end_date',
    'Rank': 'rank',
    'Last.Promotion.Date': 'last_promotion_date',
    'Requester.Full.Name': 'requester_full_name',
    'Ceremony.Date': 'ceremony_date',
    'Tracking.NO': 'tracking_no',
    'UID': 'officer_id'
})

## Clean Award data

Do the following steps:
* Change `requester_full_name` from `,` to `NaN`
* Change `tracking_no` from `-999` to `NaN`
* Replace all `NaN` with `None`

In [4]:
cleaner = DataCleaner(
    schema={
        'type': [strip, titleize],
        'rank': [strip, titleize],
        'current_status': [strip, titleize],
        'requester_full_name': [strip, titleize, ToNaN(value_set=[','])],
        'tracking_no': [strip, to_int, ToNaN(value_set=[-999])],
        
    }
)
cleaner.perform(df)

df = df.where((pd.notnull(df)), None)
df

A Jupyter Widget

Clean data: 100%|██████████| 557899/557899 [00:01<00:00, 362827.63it/s]
Clean data: 100%|██████████| 557899/557899 [00:03<00:00, 177066.45it/s]
Clean data: 100%|██████████| 557899/557899 [00:02<00:00, 238581.20it/s]
Clean data: 100%|██████████| 557899/557899 [00:03<00:00, 172638.69it/s]
Clean data: 100%|██████████| 557899/557899 [00:03<00:00, 166932.93it/s]


Unnamed: 0,type,incident_start_date,current_status,request_date,incident_end_date,rank,last_promotion_date,requester_full_name,ceremony_date,tracking_no,officer_id
0,Unit Meritorious Performance Award,2010-01-01,Denied,2012-11-02,2010-12-31,9171-Sergeant Of Police,,"Pavon Jr, Ricardo",,,1
1,Department Commendation,2014-06-27,Denied,2014-06-29,2014-06-27,9171-Sergeant Of Police,,"Henkels, Adam",,,1
2,Honorable Mention,2014-06-27,Denied,2014-07-04,2014-06-27,9171-Sergeant Of Police,,"Henkels, Adam",,,1
3,Honorable Mention,2006-05-11,Final,2006-06-07,2006-05-11,9171-Sergeant Of Police,,"Stinites, Alexander",,,1
4,Honorable Mention,2006-12-02,Final,2007-01-31,2006-12-02,9171-Sergeant Of Police,,"Crawford, Patrick",,,1
5,Honorable Mention,2006-11-09,Final,2007-02-25,2006-11-09,9171-Sergeant Of Police,,"Schachelmayer, Joseph",,,1
6,Honorable Mention,2007-06-13,Final,2007-06-14,2007-06-14,9171-Sergeant Of Police,,"Giambrone, Joseph",,,1
7,Department Commendation,2007-01-15,Final,2007-07-05,2007-01-16,9171-Sergeant Of Police,,"Giambrone, Joseph",2009-11-27,,1
8,Complimentary Letter,2007-06-10,Final,2007-10-24,2007-06-14,9171-Sergeant Of Police,,"Clausell, Ella",,,1
9,Honorable Mention,2007-11-26,Final,2008-01-11,2007-11-26,9171-Sergeant Of Police,,"Brand, Thomas",,,1


## Import Award data

#### Exclude all awards which have no corresponding officer

In [5]:
df = df[~df['officer_id'].isnull()]

#### Delete all existing Awards

In [6]:
Award.objects.using(import_db).delete()

(528305, {u'data.Award': 528305})

#### Import Award data

In [7]:
tqdm.pandas(tqdm_notebook(), desc='Import Award')
df.progress_apply(lambda x: Award.objects.using(import_db).create(**x), axis=1)

A Jupyter Widget

Import Award:  18%|█▊        | 2/11 [00:00<00:00, 19.11it/s]




Import Award: 528305it [10:23, 847.60it/s]                  


0         Award object
1         Award object
2         Award object
3         Award object
4         Award object
5         Award object
6         Award object
7         Award object
8         Award object
9         Award object
10        Award object
11        Award object
12        Award object
13        Award object
14        Award object
15        Award object
16        Award object
17        Award object
18        Award object
19        Award object
20        Award object
21        Award object
22        Award object
23        Award object
24        Award object
25        Award object
26        Award object
27        Award object
28        Award object
29        Award object
              ...     
557869    Award object
557870    Award object
557871    Award object
557872    Award object
557873    Award object
557874    Award object
557875    Award object
557876    Award object
557877    Award object
557878    Award object
557879    Award object
557880    Award object
557881    A

#### Print out the imported data

In [8]:
pd.DataFrame.from_records(Award.objects.using(import_db).all().values())

Unnamed: 0,ceremony_date,current_status,id,incident_end_date,incident_start_date,last_promotion_date,officer_id,rank,request_date,requester_full_name,tracking_no,type
0,,Final,3284914,2013-08-20,2013-08-20,2000-09-01,4176,9165-Po As Detective,2013-08-22,"Winstrom, Eric",,Honorable Mention
1,,Final,3284915,2013-09-21,2013-09-21,2000-09-01,4176,9165-Po As Detective,2013-12-26,"Winstrom, Eric",,Honorable Mention
2,,Final,3284916,2013-08-19,2013-08-19,2000-09-01,4176,9165-Po As Detective,2013-11-04,"Winstrom, Eric",,Honorable Mention
3,,Final,3284917,2013-10-21,2013-10-21,2000-09-01,4176,9165-Po As Detective,2014-02-04,"Winstrom, Eric",,Honorable Mention
4,,Final,3284918,2013-10-24,2013-10-24,2000-09-01,4176,9165-Po As Detective,2014-01-30,"Winstrom, Eric",,Honorable Mention
5,,Final,3284919,2013-11-12,2013-11-12,2000-09-01,4176,9165-Po As Detective,2014-02-04,"Winstrom, Eric",,Honorable Mention
6,,Final,3284920,2013-11-12,2013-11-12,2000-09-01,4176,9165-Po As Detective,2014-01-23,"Winstrom, Eric",,Honorable Mention
7,,Final,3284921,2013-11-14,2013-11-14,2000-09-01,4176,9165-Po As Detective,2013-11-15,"Winstrom, Eric",,Honorable Mention
8,,Final,3284922,,2013-10-29,2000-09-01,4176,9165-Po As Detective,2013-11-19,,,Emblem Of Recognition - Physical Fitness
9,,Final,3284923,2013-12-10,2013-12-10,2000-09-01,4176,9165-Po As Detective,2014-06-04,"Winstrom, Eric",,Honorable Mention
