In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Victim, Allegation
from data_importer.base.cleaner import DataCleaner, strip, titleize
from data_importer.base.storage import AzureStorage

In [2]:
# Storage handle used below to resolve the path of the input CSV.
# The account key is a secret, so it is read from the AZURE_STORAGE_ACCOUNT_KEY
# environment variable instead of being committed with the notebook; the
# original placeholder value is kept as the fallback when the variable is unset.
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY',
                                                  'your-key-here'),
                       share='cleaned-data')
# Database alias handed to every `.using(...)` call in this notebook.
import_db = 'import'

In [3]:
# Lookup table from the gender labels found in the source data to the
# one-letter codes used when cleaning the 'gender' column.
gender_dict = dict(MALE='M', FEMALE='F', UNKNOWN='X')

## Read Victim data

In [4]:
# Load the victims CSV from storage and switch its column headers to the
# lowercase names used by the rest of the notebook.
df = pd.read_csv(storage.path_for('complaints/victims.csv')).rename(
    columns=dict(CRID='crid', Gender='gender', Age='age', Race='race')
)

## Clean Victim data

In [5]:
# Per-column cleaning schema: each column name maps to the list of callables
# handed to DataCleaner for that column.
cleaner = DataCleaner(
    schema={
        'race': [strip, titleize],
        # Translate the source label to its one-letter code; any value that
        # is not a key of gender_dict becomes ''.
        'gender': [lambda x: gender_dict.get(x, '')],
        # Round ages to whole numbers. None is a singleton, so it is tested
        # by identity (`is not None`) rather than with `!=`.
        # NOTE(review): blank ages presumably reach this callable as NaN, not
        # None, and then simply pass through round() unchanged - confirm.
        'age': [lambda x: round(x, 0) if x is not None else None],
        'crid': [strip]
    }
)
# Last expression of the cell, so the notebook displays what perform() returns.
cleaner.perform(df)

100%|██████████| 4/4 [00:01<00:00,  2.90it/s]


Unnamed: 0,crid,gender,age,race
0,1000009,F,37.0,Hispanic
1,1000015,F,35.0,Black
2,1000015,F,24.0,Black
3,1000020,M,28.0,Black
4,1000021,M,26.0,Black
5,1000027,F,,Black
6,1000034,M,27.0,Black
7,1000034,M,28.0,Black
8,1000043,M,29.0,Black
9,1000043,M,36.0,Black


#### Get the ids of the allegations that the victims are associated with

In [6]:
# Map each allegation's crid to its primary key in the import database.
# Building the dict straight from (crid, id) pairs avoids a module-level loop
# variable named `id`, which would shadow the builtin for every later cell.
# If a crid occurs more than once, the last row returned wins, as before.
crid_to_id_map = dict(
    Allegation.objects.using(import_db).all().values_list('crid', 'id')
)

In [7]:
# Register `progress_apply` on pandas objects. Only keyword options are
# passed: a `tqdm_notebook()` instance given positionally is never iterated
# and just leaves an empty widget in the cell output.
tqdm.pandas(desc='Get allegation id')
# Look up the allegation id for every row; crids missing from the map give None.
df['allegation_id'] = df['crid'].progress_apply(lambda x: crid_to_id_map.get(x, None))
# crid is dropped once it has been translated into allegation_id.
del df['crid']

A Jupyter Widget

Get allegation id: 100%|██████████| 20223/20223 [00:00<00:00, 812815.96it/s]


#### Drop all victims with invalid age

In [8]:
# Keep only rows whose age is strictly positive. The comparison is False for
# NaN, so rows with a missing age are dropped along with zero and negative ages.
df = df.loc[df['age'].gt(0)]

## Import Victim data

#### Delete existing victims before importing

In [9]:
# Remove every Victim row from the import database so the import below starts
# from an empty table; the cell displays the value returned by delete().
Victim.objects.using(import_db).all().delete()

(0, {u'data.Victim': 0})

#### Import Victim data and print out the imported data

In [14]:
# Register `progress_apply` with keyword options only; a positional
# `tqdm_notebook()` instance is never iterated and just leaves an empty
# widget in the cell output.
tqdm.pandas(desc='Importing Victim data')
# axis=1 hands each row to the callback; unpacking it with ** passes the row's
# columns as keyword arguments, so one Victim is created per dataframe row.
df.progress_apply(lambda x: Victim.objects.using(import_db).create(**x), axis=1)

# Read the table back from the import database to show what was written.
pd.DataFrame.from_records(Victim.objects.using(import_db).all().values())

Unnamed: 0,age,allegation_id,gender,id,race
0,37,56657,F,1,Hispanic
1,35,56660,F,2,Black
2,24,56660,F,3,Black
3,28,56664,M,4,Black
4,26,56665,M,5,Black
5,27,56676,M,6,Black
6,28,56676,M,7,Black
7,29,56683,M,8,Black
8,36,56683,M,9,Black
9,25,56692,F,10,Black


*Currently, we're excluding the victims who have an invalid age (negative numbers); note that the `age > 0` filter also drops victims whose age is zero or missing. We will check and import them later, once they have been cleaned.*