In [10]:
import os

import pandas as pd
from tqdm import tqdm

from data.models import Officer
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, titleize

In [11]:
# Configure the AzureStorage helper for the 'cleaned-data' share.
# The account key is read from the AZURE_STORAGE_ACCOUNT_KEY environment
# variable instead of being written into the notebook, so the secret is never
# saved or committed alongside the code. A missing variable raises KeyError
# immediately rather than failing later with an authentication error.
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ['AZURE_STORAGE_ACCOUNT_KEY'],
                       share='cleaned-data')

## Read Officer Profiles data

In [12]:
# Load the officer profiles CSV, keep only the columns used by this notebook,
# and rename them to the snake_case names the later cells refer to.
df = (
    pd.read_csv(storage.path_for('profiles/officer-profiles.csv'))
    .loc[:, [
        'UID', 'Gender', 'Appointed.Date', 'Race', 'Birth.Year',
        'Last.Name', 'First.Name', 'Suffix.Name', 'Middle.Initial',
    ]]
    .rename(columns={
        'UID': 'id',
        'Gender': 'gender',
        'Appointed.Date': 'appointed_date',
        'Race': 'race',
        'Birth.Year': 'birth_year',
        'Last.Name': 'last_name',
        'First.Name': 'first_name',
        'Suffix.Name': 'suffix_name',
        'Middle.Initial': 'middle_initial',
    })
)

## Clean up the data

In [13]:
# Per-column cleaning steps: the name and gender columns go through the
# `strip` and `titleize` helpers, while race only goes through `strip`.
cleaner = DataCleaner(
    schema=dict(
        first_name=[strip, titleize],
        last_name=[strip, titleize],
        gender=[strip, titleize],
        race=[strip],
    )
)
# Last expression of the cell, so whatever `perform` returns is displayed.
cleaner.perform(df)

100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Unnamed: 0,id,gender,appointed_date,race,birth_year,last_name,first_name,suffix_name,middle_initial
0,1,Male,2005-09-26,WHITE,1971.0,Aaron,Jeffery,,M
1,2,Female,2005-09-26,HISPANIC,1980.0,Aaron,Karina,,
2,3,Male,1970-06-15,WHITE,1942.0,Abate,Daniel,,P
3,4,Male,1994-12-05,WHITE,1968.0,Abbate,Anthony,,G
4,5,Male,1969-01-06,WHITE,1942.0,Abbate,Carmel,,G
5,6,Male,1954-10-16,WHITE,1930.0,Abbate,Carmen,,S
6,7,Male,1995-12-04,WHITE,1972.0,Abbate,Terry,,M
7,8,Male,1990-03-26,BLACK,1945.0,Abbey,Leon,,B
8,9,Male,1949-03-16,BLACK,1923.0,Abbey,Leon,,
9,10,Male,1976-04-26,BLACK,1952.0,Abbey,Michael,,J


#### Fill all missing values with None

In [14]:
# Replace every missing value (NaN/NaT) with None so that later cells hand
# None, not NaN, to the Officer model.
# The frame is cast to object dtype first: a float64 or datetime64 column
# cannot store None, and on newer pandas releases `where(..., None)` applied
# directly to such a column may leave NaN in place instead of upcasting.
# The mask is computed from the original (pre-cast) frame.
df = df.astype(object).where(pd.notnull(df), None)

#### Change gender format to M/F/X

In [15]:
# Translate the gender labels into one-letter codes; any value that is not
# listed in the table (including None) becomes the empty string.
gender_codes = {'Male': 'M', 'Female': 'F', 'Unknown': 'X'}
df['gender'] = df['gender'].apply(lambda label: gender_codes.get(label, ''))
df

Unnamed: 0,id,gender,appointed_date,race,birth_year,last_name,first_name,suffix_name,middle_initial
0,1,M,2005-09-26,WHITE,1971,Aaron,Jeffery,,M
1,2,F,2005-09-26,HISPANIC,1980,Aaron,Karina,,
2,3,M,1970-06-15,WHITE,1942,Abate,Daniel,,P
3,4,M,1994-12-05,WHITE,1968,Abbate,Anthony,,G
4,5,M,1969-01-06,WHITE,1942,Abbate,Carmel,,G
5,6,M,1954-10-16,WHITE,1930,Abbate,Carmen,,S
6,7,M,1995-12-04,WHITE,1972,Abbate,Terry,,M
7,8,M,1990-03-26,BLACK,1945,Abbey,Leon,,B
8,9,M,1949-03-16,BLACK,1923,Abbey,Leon,,
9,10,M,1976-04-26,BLACK,1952,Abbey,Michael,,J


#### Replace None with `''` in the `race` column

In [16]:
# Only None is swapped for the empty string in the race column; every other
# value is passed through untouched.
df['race'] = df['race'].apply(lambda value: '' if value is None else value)
df

Unnamed: 0,id,gender,appointed_date,race,birth_year,last_name,first_name,suffix_name,middle_initial
0,1,M,2005-09-26,WHITE,1971,Aaron,Jeffery,,M
1,2,F,2005-09-26,HISPANIC,1980,Aaron,Karina,,
2,3,M,1970-06-15,WHITE,1942,Abate,Daniel,,P
3,4,M,1994-12-05,WHITE,1968,Abbate,Anthony,,G
4,5,M,1969-01-06,WHITE,1942,Abbate,Carmel,,G
5,6,M,1954-10-16,WHITE,1930,Abbate,Carmen,,S
6,7,M,1995-12-04,WHITE,1972,Abbate,Terry,,M
7,8,M,1990-03-26,BLACK,1945,Abbey,Leon,,B
8,9,M,1949-03-16,BLACK,1923,Abbey,Leon,,
9,10,M,1976-04-26,BLACK,1952,Abbey,Michael,,J


## Import officer profiles

#### Delete all existing officers before importing

In [17]:
# Database alias that every ORM call in the cells below is routed to.
import_db = 'import'
# Remove all Officer rows stored under that alias so the import starts from
# an empty table; the value returned by delete() is shown as the cell output.
Officer.objects.using(import_db).all().delete()

(0, {})

In [18]:
# One dict per DataFrame row, keyed by column name.
officer_dicts = df.to_dict(orient='records')

# Build an Officer from each record and save it to the import database,
# with tqdm reporting progress.
for officer_fields in tqdm(officer_dicts):
    Officer(**officer_fields).save(using=import_db)

# Display how many officers are now stored in the import database.
Officer.objects.using(import_db).count()

100%|██████████| 32140/32140 [00:46<00:00, 693.50it/s]


32140

## Print out the imported data

In [19]:
# Read the saved officers back from the import database as plain dicts and
# show them as a DataFrame ordered by id.
imported_rows = Officer.objects.using(import_db).all().values()
df_imported_officers = pd.DataFrame.from_records(imported_rows).sort_values('id')
df_imported_officers

Unnamed: 0,active,appointed_date,birth_year,first_name,gender,id,last_name,middle_initial,race,rank,suffix_name,tags
0,Unknown,2005-09-26,1971.0,Jeffery,M,1,Aaron,M,WHITE,,,[]
1,Unknown,2005-09-26,1980.0,Karina,F,2,Aaron,,HISPANIC,,,[]
2,Unknown,1970-06-15,1942.0,Daniel,M,3,Abate,P,WHITE,,,[]
3,Unknown,1994-12-05,1968.0,Anthony,M,4,Abbate,G,WHITE,,,[]
4,Unknown,1969-01-06,1942.0,Carmel,M,5,Abbate,G,WHITE,,,[]
5,Unknown,1954-10-16,1930.0,Carmen,M,6,Abbate,S,WHITE,,,[]
6,Unknown,1995-12-04,1972.0,Terry,M,7,Abbate,M,WHITE,,,[]
7,Unknown,1990-03-26,1945.0,Leon,M,8,Abbey,B,BLACK,,,[]
8,Unknown,1949-03-16,1923.0,Leon,M,9,Abbey,,BLACK,,,[]
9,Unknown,1976-04-26,1952.0,Michael,M,10,Abbey,J,BLACK,,,[]


Currently, `active` and `rank` are still missing. We will update them when this data becomes available.