In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Complainant, Allegation
from data_importer.base.cleaner import DataCleaner, strip, titleize
from data_importer.base.storage import AzureStorage

In [2]:
# Storage handle used below to resolve the path of the complainants CSV.
# The account key is a secret, so it is read from the AZURE_STORAGE_ACCOUNT_KEY
# environment variable instead of being written into the notebook. The previous
# placeholder is kept as the fallback, so the cell behaves exactly as before when
# the variable is not set.
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', 'your-key-here'),
                       share='cleaned-data')
# Database alias passed to `.using(...)` on every ORM query in this notebook.
import_db = 'import'

In [3]:
# Translation table from the gender labels found in the raw data to the
# one-letter codes stored on the imported records.
gender_dict = dict(MALE='M', FEMALE='F', UNKNOWN='X')

## Read Complainants data

In [4]:
# Raw CSV header -> lowercase column name used throughout the rest of the notebook.
column_names = {'CRID': 'crid', 'Gender': 'gender', 'Age': 'age', 'Race': 'race'}

# Load the complainants CSV through the storage helper and rename its columns in one step.
df = pd.read_csv(storage.path_for('complaints/complainants.csv')).rename(columns=column_names)

## Clean Complainants data

In [5]:
def _round_age(value):
    """Round an age to zero decimal places; ``None`` is passed through unchanged."""
    # Identity comparison is the correct way to test for the None singleton.
    return round(value, 0) if value is not None else None


# Cleaning steps per column: each column name maps to the list of callables
# handed to DataCleaner for that column.
cleaner = DataCleaner(
    schema={
        'race': [strip, titleize],
        # Translate the raw gender label to its code; labels missing from
        # gender_dict become an empty string.
        'gender': [lambda x: gender_dict.get(x, '')],
        'age': [_round_age],
        'crid': [strip]
    }
)
# Last expression of the cell, so the value returned by `perform` is displayed.
cleaner.perform(df)

100%|██████████| 4/4 [00:02<00:00,  1.30it/s]


Unnamed: 0,crid,gender,age,race
0,1000000,F,49.0,White
1,1000001,M,32.0,Black
2,1000002,M,43.0,Hispanic
3,1000004,M,60.0,Hispanic
4,1000004,M,53.0,White
5,1000005,F,35.0,Black
6,1000006,M,53.0,Black
7,1000007,F,56.0,White
8,1000009,M,39.0,Hispanic
9,1000010,M,43.0,Black


#### Get the ids of the allegations that the complainants are associated with

In [6]:
# Lookup table from an allegation's `crid` to its `id` in the import database.
# A dict comprehension is used instead of a top-level loop so that no loop
# variable named `id` is left behind in the notebook namespace, where it would
# shadow the builtin `id` for every later cell.
crid_to_id_map = {
    crid: allegation_id
    for allegation_id, crid
    in Allegation.objects.using(import_db).all().values_list('id', 'crid')
}

In [7]:
# Register `progress_apply` on pandas objects. `tqdm.pandas` forwards its
# arguments to the progress bar it creates, so only keyword options belong here;
# passing a freshly built `tqdm_notebook()` instance created a second, unused
# widget in the cell output.
tqdm.pandas(desc='Get allegation id')
# Resolve each complainant's allegation id from its crid; the lookup returns
# None for a crid that is not present in crid_to_id_map.
df['allegation_id'] = df['crid'].progress_apply(lambda crid: crid_to_id_map.get(crid))
# `crid` was only needed for the lookup above, so the column is removed from df.
del df['crid']

A Jupyter Widget

Get allegation id: 100%|██████████| 48214/48214 [00:00<00:00, 887134.89it/s]


#### Drop all complainants with invalid age

In [8]:
# Keep only the rows whose age is strictly greater than zero; rows for which the
# comparison is false (zero, negative or missing ages) are dropped.
has_valid_age = df['age'].gt(0)
df = df.loc[has_valid_age]

## Import Complainants data

#### Delete existing complainants before importing

In [9]:
# Remove every Complainant row from the import database so that re-running the
# notebook does not append a second copy of the data.
existing_complainants = Complainant.objects.using(import_db)
existing_complainants.delete()

(0, {u'data.Complainant': 0})

#### Import complainants data and print out the imported data

In [10]:
# Register `progress_apply` with a description only. `tqdm.pandas` forwards its
# arguments to the progress bar it creates, so a `tqdm_notebook()` instance must
# not be passed positionally; doing so only added a stray widget to the output.
tqdm.pandas(desc='Importing complainant data')
# Create one Complainant per row in the import database; the row's columns are
# passed as keyword arguments to `create`.
df.progress_apply(
    lambda row: Complainant.objects.using(import_db).create(**row),
    axis=1
)

# Read the table back so the cell displays what was actually stored.
pd.DataFrame.from_records(Complainant.objects.using(import_db).all().values())

A Jupyter Widget

Importing complainant data: 106it [00:00, 1052.21it/s]           




Importing complainant data: 40831it [00:41, 986.00it/s] 


Unnamed: 0,age,allegation_id,gender,id,race
0,49,56650,F,1,White
1,32,56651,M,2,Black
2,43,56652,M,3,Hispanic
3,60,56653,M,4,Hispanic
4,53,56653,M,5,White
5,35,56654,F,6,Black
6,53,56655,M,7,Black
7,56,56656,F,8,White
8,39,56657,M,9,Hispanic
9,43,56658,M,10,Black


*Currently, we're excluding the complainants who have an invalid age (zero, negative, or missing). We will check and import them later, once they have been cleaned.*