In [1]:
from dateutil.tz import tzutc
from dateutil.parser import parse

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Allegation, Area
from data_importer.base.storage import AzureStorage

In [2]:
storage = AzureStorage(account_name='cpdbv2data', 
                       account_key='your-key-here',
                       share='cleaned-data')
import_db = 'import'

## Read Complaints data

In [3]:
df = pd.read_csv(
    storage.path_for('complaints/complaints.csv'),
    usecols=['CRID', 'Location.Code', 'Street', 'Address.Number', 'City.State', 'Incident.Date', 'Incident.Time'],
    parse_dates={'Incident.DateTime': ['Incident.Date', 'Incident.Time']},
    date_parser=lambda x: parse(x, tzinfos=tzutc),
    na_values=['-----', '----']
)
df = df.rename(columns={
    'CRID': 'crid',
    'Location.Code': 'location',
    'Street': 'add2',
    'Address.Number': 'add1',
    'City.State': 'city',
    'Incident.DateTime': 'incident_date'
})

#### There are some records in this data are duplicated. So we need to remove duplicated records

In [4]:
df = df.drop_duplicates()

#### Add Investigator id

In [5]:
def get_investigator_id(crid):
    df = df_investigators[df_investigators['CRID']==crid]
    if df.empty:
        return np.NaN
    return df.iloc[0]['investigators_ID']

In [6]:
tqdm.pandas(tqdm_notebook(), desc='Adding investigator id')
df_investigators = pd.read_csv(
    storage.path_for('complaints/investigators.csv'),
    usecols=['investigators_ID', 'CRID']
)
df['investigator_id'] = df['crid'].progress_apply(get_investigator_id)

A Jupyter Widget

Adding investigator id: 100%|██████████| 109339/109339 [01:04<00:00, 1701.21it/s]


## Clean Complaints data

#### Do following clean steps
* Replace all `NaN` with `None`
* add2/city/location: Replace `None` with `''`

In [7]:
df = df.where(pd.notnull(df), None)
df['add2'] = df['add2'].apply(lambda x: x if x is not None else '')
df['city'] = df['city'].apply(lambda x: x if x is not None else '')
df['location'] = df['location'].apply(lambda x: x if x is not None else '')

## Import Complaints data

#### Delete all existing Allegations before importing

In [8]:
Allegation.objects.using(import_db).delete()

(0, {})

#### Import and print out the imported data

In [9]:
tqdm.pandas(tqdm_notebook(), desc='Importing Allegation data')
df.progress_apply(lambda x: Allegation.objects.using(import_db).create(**x), axis=1)

pd.DataFrame.from_records(Allegation.objects.using(import_db).all().values())

A Jupyter Widget

Importing Allegation data:   0%|          | 0/7 [00:00<?, ?it/s]




Importing Allegation data: 109339it [02:33, 712.61it/s]                 


Unnamed: 0,add1,add2,beat_id,city,crid,id,incident_date,investigator_id,location,point,source,summary
0,5327.0,W CHICAGO,,CHICAGO IL,258996,1,2000-01-01 01:20:00+00:00,1.0,04,,,
1,4316.0,W JACKSON,,CHICAGO IL,258997,2,2000-01-01 01:30:00+00:00,2.0,17,,,
2,500.0,W ILLINOIS,,CHICAGO IL,258998,3,2000-01-01 00:28:00+00:00,3.0,17,,,
3,,,,CHICAGO IL,258999,4,2000-01-01 03:30:00+00:00,4.0,17,,,
4,5327.0,W CHICAGO AVE,,CHICAGO IL,259000,5,2000-01-01 05:00:00+00:00,5.0,04,,,
5,2940.0,E 87TH ST,,CHICAGO IL,259001,6,2000-01-01 04:39:00+00:00,6.0,17,,,
6,8726.0,S MARSHFIELD,,CHICAGO IL,259002,7,2000-01-01 01:15:00+00:00,7.0,17,,,
7,,,,CHICAGO IL,259003,8,2000-01-01 12:01:00+00:00,8.0,04,,,
8,,,,CHICAGO IL,259005,9,2000-01-01 00:00:00+00:00,9.0,14,,,
9,,,,CHICAGO IL,259006,10,2000-01-01 14:30:00+00:00,10.0,07,,,


*Note: Geo data (area/line_area/beat/point) will be imported whenever the data is avaiable*