In [89]:
import json
import os

from django.contrib.gis.geos import Point

import geocoder
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import Allegation, Area
from data_importer.base.storage import AzureStorage
from data_importer.base.cleaner import DataCleaner, strip, remove_whitespace, ZFill

In [2]:
# Notebook-wide configuration.
#
# Credentials are read from the environment when present so that real secrets
# do not have to be saved inside the notebook. The literals below are kept only
# as fallbacks so the cell behaves exactly as before when the variables are
# unset.
# NOTE(review): the Mapbox token is still embedded as a fallback -- consider
# rotating it and dropping the literal once the environment variable is set up.
storage = AzureStorage(
    account_name='cpdbv2data',
    account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', 'your-key-here'),
    share='cleaned-data',
)

# Alias of the Django database connection that holds the imported data.
import_db = 'import'

# Access token passed to `geocoder.mapbox` for every geocoding request.
MAP_BOX_API_KEY = os.environ.get(
    'MAP_BOX_API_KEY',
    "pk.eyJ1IjoiaW52aXNpYmxlaW5zdGl0dXRlIiwiYSI6ImNpZ256bXRqMDAwMDBzeGtud3VoZGplNHMifQ.ky2VSGEYU5KritRMArHY-w",
)

In [3]:
# Lookup table from complaint CRID to the address stored on the matching
# Allegation row in the import database.
crid_to_address_map = {
    allegation.crid: allegation.address
    for allegation in Allegation.objects.using(import_db).all()
}

## Read and clean Complaints data

In [4]:
# Per-column cleaning steps, applied in the order listed.
# NOTE(review): `strip`, `remove_whitespace` and `ZFill(4)` are project helpers;
# presumably they trim, drop inner whitespace and zero-pad to 4 characters --
# confirm against data_importer.base.cleaner.
complaint_cleaner = DataCleaner(schema={
    'CRID': [strip],
    'Beat': [strip, remove_whitespace, ZFill(4)],
})

# Only the CRID and Beat columns are needed; identical (CRID, Beat) rows are
# collapsed after cleaning.
df = complaint_cleaner.perform(
    pd.read_csv('/complaints/complaints.csv', usecols=['CRID', 'Beat'])
).drop_duplicates()

  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 2/2 [00:06<00:00,  2.63s/it]


#### Add address field

In [5]:
def _address_for(crid):
    """Return the address recorded for `crid`; raises KeyError if it is unknown."""
    return crid_to_address_map[crid]


# Attach the address of every complaint as a new `address` column.
df = df.assign(address=df['CRID'].apply(_address_for))

## Geocoding address

In [6]:
def get_centroid(beat):
    """Return the centroid of the polygon of the beat named `beat`.

    Looks up the first `Area` with `area_type='beat'` and the given name in
    the import database.

    Returns None when `beat` is null (as judged by `pd.isnull`) or when no
    matching area exists.
    """
    if pd.isnull(beat):
        return None

    beat_area = (
        Area.objects.using(import_db)
        .filter(area_type='beat', name=beat)
        .first()
    )
    if beat_area is None:
        return None
    return beat_area.polygon.centroid

In [7]:
# Results of proximity-free lookups, keyed by the address string.
address_cache = {}


def get_address_mapbox_api(address, proximity=None, key=MAP_BOX_API_KEY):
    """Geocode `address` with Mapbox and return the result's `latlng`.

    address:   the address text to look up.
    proximity: optional value forwarded to `geocoder.mapbox` as `proximity`.
    key:       Mapbox access token.

    Lookups without a (truthy) proximity are memoised in `address_cache`;
    lookups with a proximity always hit the API and are never cached.
    """
    if proximity:
        return geocoder.mapbox(address, proximity=proximity, key=key).latlng

    if address not in address_cache:
        address_cache[address] = geocoder.mapbox(address, key=key).latlng
    return address_cache[address]

def geocode(address, beat):
    """Locate a complaint from its address, falling back to its beat.

    When `address` is the empty string the beat centroid from
    `get_centroid(beat)` is returned as-is (possibly None). Otherwise the
    address is geocoded through `get_address_mapbox_api`, biased towards the
    beat centroid when one is available.

    Note that the two branches return different kinds of objects: a centroid
    in the first case, the geocoder's `latlng` value in the second.
    """
    centroid = get_centroid(beat)
    if address == '':
        return centroid

    if centroid is None:
        bias = None
    else:
        # Passed as [y, x] -- presumably (latitude, longitude); confirm
        # against the geocoder's expected `proximity` format.
        bias = [centroid.y, centroid.x]

    return get_address_mapbox_api(address, proximity=bias)

In [8]:
# Register `progress_apply` on pandas objects. `tqdm.pandas` forwards its
# arguments to the progress bar it creates for each apply, so only the bar
# options are passed here; the previous `tqdm_notebook()` argument built a
# second, never-updated bar (the stray widget in the cell output).
tqdm.pandas(desc='Geocoding address')

# Geocode every row from its address and beat. This issues one lookup per row
# and is slow for the full data set.
df = df.assign(geo=df.progress_apply(lambda x: geocode(x['address'], x['Beat']), axis=1))

A Jupyter Widget

Geocoding address: 100%|██████████| 109339/109339 [8:13:35<00:00,  3.79it/s]  


#### Add latitude and longitude fields

In [70]:
def _pick_coordinate(row, geocoded_index, centroid_index):
    """Pull one coordinate out of a row's `geo` value.

    Returns None when `geo` is falsy. Rows with a non-empty address hold the
    geocoder result and are read at `geocoded_index`; rows with an empty
    address hold the beat centroid returned by `geocode` and are read at
    `centroid_index`.
    """
    geo = row['geo']
    if not geo:
        return None
    if row['address'] != '':
        return geo[geocoded_index]
    return geo[centroid_index]


# The two kinds of `geo` value are indexed in opposite order, hence the
# swapped index pairs for latitude and longitude.
df = df.assign(
    lat=df.apply(_pick_coordinate, axis=1, args=(0, 1)),
    lng=df.apply(_pick_coordinate, axis=1, args=(1, 0)),
)

#### Write the geocoded data to a CSV file

In [88]:
# Save the geocoded frame (index included); the path is relative to the
# current working directory.
output_path = 'allegation_areas.csv'
df.to_csv(output_path)