In [1]:
import os
import string
from glob import iglob

import pandas as pd
import requests
import grequests


def drop_fractions(s):
    return ' '.join([part for part in s.split() if not '/' in part])

TRANS_TABLE = str.maketrans({key: None for key in string.punctuation})

ImportError: No module named 'pandas'

In [2]:
# Load Ellis dataset
DATA_DIR = '/Users/mcmenamin/GitHub/la_mayors_office/data'
DATA_FILE = 'To Mayor - Ellis 7-16-2007 - 7-31-2017 Ran 8-18-2017.xlsx'

OUT_FILE = 'ellis_geocoded.csv'

df_ellis = pd.read_excel(os.path.join(DATA_DIR, DATA_FILE))
df_ellis.APN = df_ellis.APN.astype(str)
df_ellis.Zip = df_ellis.Zip.astype(str)

df_ellis.Address = df_ellis.Address.str.strip()
df_ellis.Address = df_ellis.Address.apply(drop_fractions)
df_ellis.Address = df_ellis.Address.apply(lambda x: x.translate(TRANS_TABLE).split(' CA ')[0])
df_ellis['State'] = 'CA'

In [3]:
def format_address_from_api(row):
    
    row_dict = {'APN': str(row.APN)}

    try:
        row_dict['Address Matched'] = row.address2
        row_dict['Latitude/Longitude'] = "({}, {})".format(*row.latlon.split(',')[::-1])
    except:
        row_dict['Address Matched'] = ''
        row_dict['Latitude/Longitude'] = ''
    return row_dict

In [4]:
# Break ellis into batches of 1000

_to_batches = df_ellis.rename(columns={'Address':'Street address', 'APN':'Unique ID', 'Zip':'ZIP'})
_to_batches['City'] = ''
_to_batches = _to_batches[['Unique ID', 'Street address', 'City', 'State', 'ZIP']]

batch_size = 1000
for i in range(0, _to_batches.shape[0], batch_size):
    _to_batches.iloc[i : (i + batch_size)].to_csv(
        os.path.join(DATA_DIR, 'tmp_batch_{}.csv'.format(i)),
        index=False
    )

# Use census API to batch-encode

CURL_CMD = """
    curl --form addressFile=@{file} \
          --form benchmark={benchmark} https://geocoding.geo.census.gov/geocoder/locations/addressbatch \
          --output {return_file}
          """

for f in iglob(os.path.join(DATA_DIR, 'tmp_batch_*.csv'.format(i))):
    print(f)
    _cmd = CURL_CMD.format(
        file=f,
        benchmark='Public_AR_Current',
        return_file=f.split('.csv')[0] + '_coded.csv'
    )
    
    os.system(_cmd)

# Merge geocoded results into a single useful table

df_coded = []
columns = ['APN', 'address1', 'has_match', 'is_exact_match', 'address2', 'latlon', 'tmp1', 'tmp2']
for f in iglob(os.path.join(DATA_DIR, 'tmp_batch_*_coded.csv'.format(i))):
    df_coded.append(pd.read_csv(f, names=columns))

df_coded = pd.concat(df_coded, axis=0).drop(['tmp1', 'tmp2', 'has_match', 'is_exact_match', 'address1'], 1)
df_coded = pd.DataFrame([format_address_from_api(row_data) for row_num, row_data in df_coded.iterrows()])
df_coded = df_ellis.merge(df_coded, how='left', on='APN')

df_coded = df_coded.rename(columns={'Date Filed': 'Status Date'})
df_coded['Status'] = 'Withdrawal Filed'
df_coded['Permit Type'] = 'Ellis Withdrawal Application'

df_coded.to_csv(
    os.path.join(DATA_DIR, 'Ellis_withdrawals_geocoded.csv'),
    index=False
)

/Users/mcmenamin/GitHub/la_mayors_office/data/tmp_batch_0.csv
/Users/mcmenamin/GitHub/la_mayors_office/data/tmp_batch_0_coded.csv
/Users/mcmenamin/GitHub/la_mayors_office/data/tmp_batch_1000.csv
/Users/mcmenamin/GitHub/la_mayors_office/data/tmp_batch_1000_coded.csv


In [6]:
df_coded.head()

Unnamed: 0,APN,Address,Zip,CD,Permit Issue Date,State,Address Matched,Latitude/Longitude,Status,Permit Type
0,5080031003,961 S ARDMORE AVE,90006,10.0,2007-07-18,CA,"961 S ARDMORE AVE, LOS ANGELES, CA, 90006","(34.053787, -118.30208)",Withdrawal Filed,Ellis Withdrawal Application
1,5080031004,965 S ARDMORE AVE,90006,10.0,2007-07-18,CA,"965 S ARDMORE AVE, LOS ANGELES, CA, 90006","(34.05368, -118.30208)",Withdrawal Filed,Ellis Withdrawal Application
2,2182008020,5114 N ENFIELD AVE,91316,5.0,2007-07-19,CA,"5114 ENFIELD AVE, VAN NUYS, CA, 91316","(34.161873, -118.52398)",Withdrawal Filed,Ellis Withdrawal Application
3,2203017041,7450 N LOUISE AVE,91406,6.0,2007-07-19,CA,"7450 LOUISE AVE, LAKE BALBOA, CA, 91406","(34.20612, -118.50986)",Withdrawal Filed,Ellis Withdrawal Application
4,2421030018,4603 N DENNY AVE,91602,2.0,2007-07-24,CA,"4603 DENNY AVE, W TOLUCA LAKE, CA, 91602","(34.154064, -118.36478)",Withdrawal Filed,Ellis Withdrawal Application


## This section uses the API to make individual geocoding calls

it's way slower, but it's nice because they'll break an address into components (street, number, etc)

In [None]:
# Use API individual calls

def format_data_for_api(row_num, row_data):
    #'city': 'Los Angeles',

    data = {
        'street': row_data.Address.strip(),
        'state': 'CA',
        'zip': row_data.Zip,
        'format': 'json',
        'benchmark': 'Public_AR_Current',
        'row_num': row_num
    }
    return data

def format_data_from_api(resp):
    j_data = resp.json()
    addy_data = j_data['result']['addressMatches'][0]
    in_addy_data = j_data['result']['input']['address']
    out_dict = {
        'in_street': in_addy_data['street'],
        'in_zip': in_addy_data.get('zip', 'none'),
        'Address Start': addy_data['addressComponents']['fromAddress'],
        'Address End': addy_data['addressComponents']['toAddress'],
        'Street Direction': addy_data['addressComponents']['preDirection'],
        'Street Name': addy_data['addressComponents']['streetName'],
        'Street Suffix': addy_data['addressComponents']['suffixType'],
        'Zip Code': addy_data['addressComponents'].get('zip', ''),
        'Latitude/Longitude': (addy_data['coordinates']['y'], addy_data['coordinates']['x'])
    }
    return out_dict

API_URL = "https://geocoding.geo.census.gov/geocoder/{returntype}/{searchtype}"

query_params = {
    'returntype': 'locations',
    'searchtype': 'address'
}

reqs = [grequests.get(API_URL.format(**query_params), params=format_data_for_api(row_num, row_data))
        for row_num, row_data in df_ellis.iterrows()]

resps = grequests.map(reqs, size=10)

resp_dicts = []
for r in resps:
    if r.status_code==200:
        new_dict = None
        try:
            new_dict = format_data_from_api(r)
        except IndexError:
            print('index error: {}'.format(r.json()['result']['input']['address']['street']))
            r_redo = requests.get(
                API_URL.format(**query_params),
                params={'state': 'CA',
                        'city': 'Los Angeles',
                        'street': r.json()['result']['input']['address']['street'],
                        'benchmark': 'Public_AR_Current',
                        'format':'json'}
            )
            try:
                new_dict = format_data_from_api(r_redo)
            except IndexError:
                print('   still bad after ignoring ZIP...')
        if new_dict:
            resp_dicts.append(new_dict)
    else:
        print('bad status code')
df_from_api = pd.DataFrame(resp_dicts)

df_full = (
    df_from_api.merge(df_ellis, left_on=['in_street', 'in_zip'], right_on=['Address', 'Zip']).
    drop(['in_street', 'in_zip'], 1)
)

df_full.to_csv(os.path.join(DATA_DIR, OUT_FILE), index=False)

Traceback (most recent call last):
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py", line 441, in wrap_socket
    cnx.do_handshake()
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1716, in do_handshake
    self._raise_ssl_error(self._ssl, result)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1431, in _raise_ssl_error
    raise WantReadError()
OpenSSL.SSL.WantReadError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/gevent/greenlet.py", line 536, in run
    result = self._run(*self.args, **self.kwargs)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/grequests.py", line 72, in send
    self.url, **merged_kwargs)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/l

KeyboardInterrupt: 

Traceback (most recent call last):
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py", line 441, in wrap_socket
    cnx.do_handshake()
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1716, in do_handshake
    self._raise_ssl_error(self._ssl, result)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/OpenSSL/SSL.py", line 1431, in _raise_ssl_error
    raise WantReadError()
OpenSSL.SSL.WantReadError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/gevent/greenlet.py", line 536, in run
    result = self._run(*self.args, **self.kwargs)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/lib/python3.5/site-packages/grequests.py", line 72, in send
    self.url, **merged_kwargs)
  File "/Users/mcmenamin/.virtualenvs/datakind_la/l