In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

from data.models import Officer, OfficerBadgeNumber
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, titleize, to_int, ZFill

In [2]:
# Credentials should not be committed with the notebook: take the storage
# account key from the AZURE_STORAGE_KEY environment variable. The placeholder
# fallback keeps the previous behaviour when the variable is not set.
storage = AzureStorage(
    account_name='cpdbv2data',
    account_key=os.environ.get('AZURE_STORAGE_KEY', 'your-key-here'),
    share='cleaned-data'
)

# Database alias handed to `.using(...)` by the ORM calls in the import cells.
import_db = 'import'

## Read and clean current star data

In [3]:
# Load just the officer id and current star columns from the final profiles
# file, then switch to the column names used in the rest of the notebook.
current_star_renames = {'UID': 'officer_id', 'Current.Star': 'current_star'}
final_profiles_path = storage.path_for('profiles/final-profiles.csv')
df_final_profiles = pd.read_csv(final_profiles_path, usecols=['UID', 'Current.Star'])
df_final_profiles = df_final_profiles.rename(columns=current_star_renames)

# Run the `to_int` and `ZFill()` cleaning steps on `current_star`; whatever
# `perform` returns is shown as the cell output.
# NOTE(review): `df_final_profiles` is not reassigned here, so later cells
# presumably rely on `perform` updating the frame in place - confirm.
current_star_cleaner = DataCleaner(schema={'current_star': [to_int, ZFill()]})
current_star_cleaner.perform(df_final_profiles)

100%|██████████| 1/1 [00:00<00:00,  1.22it/s]


Unnamed: 0,officer_id,current_star
0,1,1424
1,2,20373
2,3,
3,4,
4,5,
5,6,18601
6,7,19596
7,8,
8,9,
9,10,


## Read and clean data for all stars (Star1 to Star10)

In [4]:
# The ten star columns as named in the CSV ('Star1' .. 'Star10'). They are
# defined once so the read, rename and cleaning steps cannot drift apart.
star_source_columns = ['Star{}'.format(i) for i in range(1, 11)]

df_profiles = pd.read_csv(
    storage.path_for('profiles/officer-profiles.csv'),
    usecols=['UID'] + star_source_columns,
    na_values=[-999]  # -999 is read as a missing value
)

# 'UID' becomes 'officer_id'; every 'StarN' becomes lower-case 'starN'.
column_renames = {'UID': 'officer_id'}
column_renames.update((col, col.lower()) for col in star_source_columns)
df_profiles = df_profiles.rename(columns=column_renames)

# Every star column gets the same two cleaning steps, each with its own
# `ZFill()` instance. Whatever `perform` returns is shown as the cell output.
DataCleaner(
    schema={col.lower(): [to_int, ZFill()] for col in star_source_columns}
).perform(df_profiles)

100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


Unnamed: 0,officer_id,star3,star4,star7,star8,star9,star2,star1,star5,star6,star10
0,1,,,,,,,13001,,,
1,2,,,,,,13705,17545,,,
2,3,2628,,,,,3023,14535,,,
3,4,,,,,,,18601,,,
4,5,,,,,,,20484,,,
5,6,,,,,,,,,,
6,7,,,,,,,19596,,,
7,8,,,,,,,6799,,,
8,9,,,,,,,,,,
9,10,,,,,,,16511,,,


In [5]:
# Keep only the first 32140 rows (by position) ahead of the merge.
# NOTE(review): the reason for this exact cutoff is not stated in the
# notebook - confirm where 32140 comes from.
df_final_profiles = df_final_profiles.iloc[:32140]

In [6]:
# Join current-star rows with the star1..star10 rows on `officer_id`
# (pandas' default inner join, so only ids present in both frames remain).
df = df_final_profiles.merge(df_profiles, on='officer_id')

## Build badge number set

In [7]:
tqdm.pandas(tqdm_notebook(), desc='Build badgenumber_set')

# The per-officer star columns; they are folded into one set and then dropped.
historical_star_columns = [
    'star1', 'star2', 'star3', 'star4', 'star5',
    'star6', 'star7', 'star8', 'star9', 'star10'
]


def collect_badge_numbers(row):
    """Return the set of non-null star values found on one merged row.

    Looks at star1..star10 followed by current_star and keeps every value
    for which `pd.notnull` is true; duplicates collapse because the result
    is a set.
    """
    candidates = [row[column] for column in historical_star_columns + ['current_star']]
    return {star for star in candidates if pd.notnull(star)}


df['badgenumber_set'] = df.progress_apply(collect_badge_numbers, axis=1)

# `current_star` is kept (the import step compares against it); only the
# star1..star10 columns are removed.
df = df.drop(historical_star_columns, axis=1)

A Jupyter Widget

Build badgenumber_set: 32140it [00:02, 12613.27it/s]         


## Import badge number data

#### Delete all existing badge numbers before importing

In [8]:
# Remove every OfficerBadgeNumber row from the import database; the value
# returned by `delete()` is shown as the cell output.
existing_badge_numbers = OfficerBadgeNumber.objects.using(import_db)
existing_badge_numbers.delete()

(0, {u'data.OfficerBadgeNumber': 0})

#### Import and print out the imported data

In [9]:
def import_badgenumber(data):
    """Bulk-create one OfficerBadgeNumber per star in a row's badge number set.

    `data` is indexed by 'officer_id', 'current_star' and 'badgenumber_set'.
    Each created record is flagged `current` when its star equals the row's
    'current_star'. All records for the row are written to the `import_db`
    database in a single `bulk_create` call. Returns None.
    """
    badge_numbers = []
    for star in data['badgenumber_set']:
        is_current = star == data['current_star']
        badge_numbers.append(
            OfficerBadgeNumber(
                officer_id=data['officer_id'],
                star=star,
                current=is_current
            )
        )
    OfficerBadgeNumber.objects.using(import_db).bulk_create(badge_numbers)

In [10]:
# Call `import_badgenumber` once per row of `df` (axis=1) with a progress bar.
tqdm.pandas(tqdm_notebook(), desc='Importing badge number')
df.progress_apply(import_badgenumber, axis=1)

# Read the stored rows back from the import database and show them as a
# DataFrame (the cell output).
imported_badge_numbers = OfficerBadgeNumber.objects.using(import_db).all()
pd.DataFrame.from_records(imported_badge_numbers.values())

A Jupyter Widget

Importing badge number: 107it [00:00, 1064.13it/s]           




Importing badge number: 32140it [00:31, 1017.38it/s]


Unnamed: 0,current,id,officer_id,star
0,False,1,1,13001
1,True,2,1,1424
2,True,3,2,20373
3,False,4,2,17545
4,False,5,2,13705
5,False,6,3,14535
6,False,7,3,2628
7,False,8,3,3023
8,False,9,4,18601
9,False,10,5,20484
