In [2]:
import requests
from typing import *
import pandas as pd
from ratelimiter import RateLimiter
import ast

# Standardize city names

Use the Google Maps API to standardize place names in our existing data.

In [3]:
 # 40 calls per second
@RateLimiter(max_calls=40, period=1)
def get_city_data(city: str, state: str, api_key: str) -> Union[List[tuple], Tuple[None, None]]:
    """Geocode ``"<city>,<state>"`` with the Google Maps Geocoding API.

    Calls are throttled by the ``RateLimiter`` decorator (``max_calls=40``,
    ``period=1``).

    Args:
        city: City name as it appears in the source data.
        state: State or territory, as it appears in the source data.
        api_key: Google Maps API key.

    Returns:
        A list with one tuple per geocoding match, laid out as
        ``(lat, lng, address_components, formatted_address, place_id, types)``.
        When the request fails or nothing matches, the two-element sentinel
        ``(None, None)`` is returned instead. That shape is kept on purpose:
        the result-packaging cells in this notebook test
        ``result[1][0] == None``.
    """
    try:
        # Let requests build the query string so that every reserved character
        # (not only spaces) in the city/state is percent-encoded.
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={"address": f"{city},{state}", "key": api_key},
            timeout=30,
        )
    except requests.RequestException as error:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the API key.
        print("Request failed:", type(error).__name__)
        return None, None

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        print(response.text)
        return None, None

    data = response.json()
    # The API reports quota and authorization problems in the JSON body with
    # HTTP 200, so surface any status other than a normal hit or miss.
    status = data.get('status')
    if status not in ('OK', 'ZERO_RESULTS'):
        print("Geocoding returned status:", status, data.get('error_message', ''))

    matches = data.get('results') or []
    if not matches:
        return None, None

    return [
        (
            match['geometry']['location']['lat'],
            match['geometry']['location']['lng'],
            match['address_components'],
            match['formatted_address'],
            match['place_id'],
            match['types'],
        )
        for match in matches
    ]

# Code REAC public inspection data

In [51]:
# Load the REAC inspection extract; the next cell reads its CITYSTATE column.
reac_13_18 = pd.read_csv('../data/processed/reac_13-18.csv')

In [52]:
# Split every distinct CITYSTATE value on commas into a list of parts.
unique_cities = list(
    map(lambda citystate: citystate.split(','), reac_13_18.CITYSTATE.unique())
)

In [53]:
# Keep only the entries from position 4008 onward.
# NOTE(review): 4008 is presumably where an earlier batch of API calls stopped
# (the results are later read back as two chunk files) — confirm before re-running.
unique_cities_chunk2 = unique_cities[4008:]

Make ~8000 calls to the Google Maps API. (Took ~25 minutes.)

In [54]:
# results = []
# for row in unique_cities:
#     results.append((row[0] + ',' + row[1], get_city_data(row[0], row[1], api_key)))

# results2 = []
# for row in unique_cities_chunk2:
#     results2.append((row[0] + ',' + row[1], get_city_data(row[0], row[1], api_key)))

Package the result into a dataframe.

In [55]:
# rows = []
# for result in results2:
#     if len(result[1]) == 1:
#         # print(list(result[0]) + list(result[1]))
#         rows.append([result[0]] + list(result[1][0]))
#     if len(result[1]) > 1:
#         if result[1][0] == None:
#             continue
#         for row in result[1]:
#             rows.append([result[0]] + list(row))


Give meaningful names to the columns.

In [56]:
# df = pd.DataFrame(rows)
# df = df.rename(columns={
#     0: 'citystate',
#     1: 'latitude',
#     2: 'longitude',
#     3: 'address_components',
#     4: 'formatted_address',
#     5: 'place_id',
#     6: 'types'
# })

# Process corrected REAC names

Read dfs back from csv.

In [57]:
# Re-load the two saved batches of geocoding results.
# NOTE(review): no cell visible in this notebook writes these two files.
chunk1 = pd.read_csv('reac_rename_chunk1.csv')
chunk2 = pd.read_csv('reac_rename_chunk2.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each chunk keeps its own 0-based labels and the index contains duplicates.
reac = pd.concat([chunk1, chunk2], axis=0, ignore_index=True)

Pull out the city and state names from the dataset.

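* For some reason, the Google Maps Geocoding API fails whenever the city is named "Canton". For those rows we keep the original city/state value instead of the geocoded one.
* For some reason, Puerto Rico is identified as a country rather than a state, so its addresses are rebuilt from the `administrative_area_level_1` component followed by `,PR`.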

In [59]:
def get_locality(address_components: str) -> Optional[str]:
    """Return the place name found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``long_name`` of the first component whose types include
        ``'locality'`` or ``'administrative_area_level_3'``; ``None`` when no
        component qualifies or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        kinds = component['types']
        if 'locality' in kinds or 'administrative_area_level_3' in kinds:
            return component['long_name']
    return None

def get_state_code(address_components: str) -> Optional[str]:
    """Return the state code found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``short_name`` of the first component whose types include
        ``'administrative_area_level_1'``; ``None`` when there is no such
        component or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        if 'administrative_area_level_1' in component['types']:
            return component['short_name']
    return None
        
def fix_puerto_rico(address_components: str, corrected_address: str) -> str:
    """Rebuild the corrected address for rows geocoded to Puerto Rico.

    A row counts as Puerto Rico when one of its components has ``'country'``
    among its types and ``short_name == 'PR'``.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.
        corrected_address: The address computed so far for this row.

    Returns:
        For Puerto Rico rows, the upper-cased ``short_name`` of the first
        ``'administrative_area_level_1'`` component followed by ``',PR'``.
        Otherwise (including missing input, or no such component)
        ``corrected_address`` unchanged.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return corrected_address  # missing cell: leave the row untouched
    in_puerto_rico = any(
        'country' in component['types'] and component['short_name'] == 'PR'
        for component in components
    )
    if in_puerto_rico:
        for component in components:
            if 'administrative_area_level_1' in component['types']:
                return component['short_name'].upper() + ',PR'
    return corrected_address

def fix_cantons(citystate: str, corrected_address: str) -> str:
    """Keep the original value for cities named exactly ``CANTON``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        ``citystate`` when the text before its first comma is ``'CANTON'``,
        otherwise ``corrected_address``.
    """
    city, _, _ = citystate.partition(',')
    return citystate if city == 'CANTON' else corrected_address

# Original city/state values that are all rewritten to 'NEW YORK,NY'.
ny_fixes = dict.fromkeys(
    (
        'STATEN ISLAND,NY',
        'BRONX,NY',
        'BROOKLYN,NY',
        'QUEENS,NY',
        'MANHATTAN,NY',
    ),
    'NEW YORK,NY',
)


def fix_new_york(citystate: str, corrected_address: str) -> str:
    """Replace the corrected address for the inputs listed in ``ny_fixes``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        The ``ny_fixes`` entry for ``citystate`` when there is one, otherwise
        ``corrected_address``.
    """
    return ny_fixes.get(citystate, corrected_address)



# Build the upper-cased "LOCALITY,STATE" value from the geocoder components.
reac['corrected_address'] = (
    reac.address_components.apply(get_locality).str.upper()
    + ','
    + reac.address_components.apply(get_state_code).str.upper()
)
# Apply the manual overrides in order; each receives the current
# corrected_address and either replaces it or passes it through.
reac['corrected_address'] = reac.apply(
    lambda row: fix_puerto_rico(row['address_components'], row['corrected_address']),
    axis='columns',
)
reac['corrected_address'] = reac.apply(
    lambda row: fix_cantons(row['citystate'], row['corrected_address']),
    axis='columns',
)
reac['corrected_address'] = reac.apply(
    lambda row: fix_new_york(row['citystate'], row['corrected_address']),
    axis='columns',
)


Some rows were geocoded at a level of granularity other than 'locality'. We'll have to deal with each of these result types separately.

In [60]:
# Count rows per value of the geocoder's `types` column, most frequent first.
reac.groupby(by='types').size().sort_values(ascending=False)

types
['locality', 'political']                                                      8140
['administrative_area_level_3', 'political']                                    262
['neighborhood', 'political']                                                   251
['administrative_area_level_1', 'political']                                     50
['administrative_area_level_2', 'political']                                     34
['establishment', 'natural_feature']                                             13
['political', 'sublocality', 'sublocality_level_1']                               9
['colloquial_area', 'political']                                                  7
['political']                                                                     6
['route']                                                                         4
['country', 'political']                                                          2
['establishment', 'park', 'point_of_interest', 'tourist_attraction']  

In [68]:
# Keep only the rows for which a corrected address could be derived.
has_corrected_address = reac['corrected_address'].notna()
reac_fixed = reac.loc[has_corrected_address]

The following result types either needed no further work or were fixed:
* ['neighborhood', 'political'] is standardized already, because we also have the locality key
* ['administrative_area_level_3', 'political'] contains valid town/village names
* ['political', 'sublocality', 'sublocality_level_1'] were handled individually (fixed manually)

In [137]:
# Save the rows that have a corrected address; the index is not written.
reac_fixed.to_csv('reac_fixed.csv', index=False, sep=',')

# Code NFIRS data

In [70]:
# Load the NFIRS extract; later cells use its CITYSTATE and SUPPORT columns.
nfirs = pd.read_csv('../data/processed/other_nfirs_13_18.csv')

Strip leading and trailing whitespace from the city/state values.

In [92]:
# Trim surrounding whitespace with the vectorized string accessor. Unlike
# calling x.strip() on each element, a missing value stays NaN instead of
# raising AttributeError.
nfirs['CITYSTATE'] = nfirs.CITYSTATE.str.strip()

In [101]:
# Rows that still need geocoding: CITYSTATE matches neither an original nor a
# corrected REAC name, and SUPPORT is above 100.
nfirs_sub = nfirs.loc[
    ~(
        nfirs.CITYSTATE.isin(reac_fixed.citystate)
        | nfirs.CITYSTATE.isin(reac_fixed.corrected_address)
    )
    & (nfirs.SUPPORT > 100)
]

In [108]:
# Distinct remaining locations, each split on commas into a list of parts.
nfirs_locs = nfirs_sub.CITYSTATE.unique()
nfirs_split = [place.split(',') for place in nfirs_locs]

In [114]:
# results3 = []
# for row in nfirs_split:
#     results3.append((row[0] + ',' + row[1], get_city_data(row[0], row[1], api_key)))


In [117]:
# rows3 = []
# for result in results3:
#     if len(result[1]) == 1:
#         # print(list(result[0]) + list(result[1]))
#         rows3.append([result[0]] + list(result[1][0]))
#     if len(result[1]) > 1:
#         if result[1][0] == None:
#             continue
#         for row in result[1]:
#             rows3.append([result[0]] + list(row))

In [120]:
# nfirs_api = pd.DataFrame(rows3)
# nfirs_api = nfirs_api.rename(columns={
#     0: 'citystate',
#     1: 'latitude',
#     2: 'longitude',
#     3: 'address_components',
#     4: 'formatted_address',
#     5: 'place_id',
#     6: 'types'
# })

In [122]:
# nfirs_api.to_csv('nfirs_api.csv', index=False, sep=',')

In [129]:
# Read back the geocoding results for the NFIRS locations (the cell that wrote
# 'nfirs_api.csv' is commented out above).
nfirs_api = pd.read_csv('nfirs_api.csv', sep=',')

In [131]:
def get_locality(address_components: str) -> Optional[str]:
    """Return the place name found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``long_name`` of the first component whose types include
        ``'locality'`` or ``'administrative_area_level_3'``; ``None`` when no
        component qualifies or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        kinds = component['types']
        if 'locality' in kinds or 'administrative_area_level_3' in kinds:
            return component['long_name']
    return None

def get_state_code(address_components: str) -> Optional[str]:
    """Return the state code found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``short_name`` of the first component whose types include
        ``'administrative_area_level_1'``; ``None`` when there is no such
        component or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        if 'administrative_area_level_1' in component['types']:
            return component['short_name']
    return None
        
def fix_puerto_rico(address_components: str, corrected_address: str) -> str:
    """Rebuild the corrected address for rows geocoded to Puerto Rico.

    A row counts as Puerto Rico when one of its components has ``'country'``
    among its types and ``short_name == 'PR'``.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.
        corrected_address: The address computed so far for this row.

    Returns:
        For Puerto Rico rows, the upper-cased ``short_name`` of the first
        ``'administrative_area_level_1'`` component followed by ``',PR'``.
        Otherwise (including missing input, or no such component)
        ``corrected_address`` unchanged.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return corrected_address  # missing cell: leave the row untouched
    in_puerto_rico = any(
        'country' in component['types'] and component['short_name'] == 'PR'
        for component in components
    )
    if in_puerto_rico:
        for component in components:
            if 'administrative_area_level_1' in component['types']:
                return component['short_name'].upper() + ',PR'
    return corrected_address

def fix_cantons(citystate: str, corrected_address: str) -> str:
    """Keep the original value for cities named exactly ``CANTON``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        ``citystate`` when the text before its first comma is ``'CANTON'``,
        otherwise ``corrected_address``.
    """
    city, _, _ = citystate.partition(',')
    return citystate if city == 'CANTON' else corrected_address

# Original city/state values that are all rewritten to 'NEW YORK,NY'.
ny_fixes = dict.fromkeys(
    (
        'STATEN ISLAND,NY',
        'BRONX,NY',
        'BROOKLYN,NY',
        'QUEENS,NY',
        'MANHATTAN,NY',
    ),
    'NEW YORK,NY',
)


def fix_new_york(citystate: str, corrected_address: str) -> str:
    """Replace the corrected address for the inputs listed in ``ny_fixes``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        The ``ny_fixes`` entry for ``citystate`` when there is one, otherwise
        ``corrected_address``.
    """
    return ny_fixes.get(citystate, corrected_address)


# Build the upper-cased "LOCALITY,STATE" value from the geocoder components.
nfirs_api['corrected_address'] = (
    nfirs_api.address_components.apply(get_locality).str.upper()
    + ','
    + nfirs_api.address_components.apply(get_state_code).str.upper()
)
# Apply the manual overrides in order; each receives the current
# corrected_address and either replaces it or passes it through.
nfirs_api['corrected_address'] = nfirs_api.apply(
    lambda row: fix_puerto_rico(row['address_components'], row['corrected_address']),
    axis='columns',
)
nfirs_api['corrected_address'] = nfirs_api.apply(
    lambda row: fix_cantons(row['citystate'], row['corrected_address']),
    axis='columns',
)
nfirs_api['corrected_address'] = nfirs_api.apply(
    lambda row: fix_new_york(row['citystate'], row['corrected_address']),
    axis='columns',
)


In [135]:
# Column summary; the non-null count of corrected_address shows how many rows
# could not be given a corrected address.
nfirs_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10967 entries, 0 to 10966
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   citystate           10967 non-null  object 
 1   latitude            10967 non-null  float64
 2   longitude           10967 non-null  float64
 3   address_components  10967 non-null  object 
 4   formatted_address   10967 non-null  object 
 5   place_id            10967 non-null  object 
 6   types               10967 non-null  object 
 7   corrected_address   10476 non-null  object 
dtypes: float64(2), object(6)
memory usage: 685.6+ KB


In [138]:
# Save the NFIRS geocoding results with their corrected addresses; the index
# is not written. Rows without a corrected address are still included.
nfirs_api.to_csv('nfirs_fixed.csv', sep=',', index=False)

# Code NSPIRE data

In [14]:
# Load the NSPIRE deficiencies file. low_memory=False makes pandas parse the
# file in one pass rather than in chunks, avoiding mixed-type column inference.
nspire = pd.read_csv('../data/nspire_demo_deficiencies.csv', low_memory=False)

In [15]:
# Drop rows that lack either the shipping city or the shipping state/province.
nspire = nspire[
    nspire['Shipping City'].notna()
    & nspire['Shipping State/Province'].notna()
]

In [16]:
# Trim surrounding whitespace with the vectorized string accessor rather than
# a per-element lambda. A non-string cell becomes NaN instead of raising.
nspire['Shipping City'] = nspire['Shipping City'].str.strip()
nspire['Shipping State/Province'] = nspire['Shipping State/Province'].str.strip()

In [18]:
# Join city and state with a comma, then upper-case the combined value.
nspire['citystate'] = (
    nspire['Shipping City'] + ',' + nspire['Shipping State/Province']
).str.upper()

In [21]:
# Distinct NSPIRE locations, each split on commas into a list of parts.
nspire_locs = nspire.citystate.unique()
nspire_split = [place.split(',') for place in nspire_locs]

In [26]:
# results4 = []
# for row in nspire_split:
#     results4.append((row[0] + ',' + row[1], get_city_data(row[0], row[1], api_key)))

In [25]:
# rows4 = []
# for result in results4:
#     if len(result[1]) == 1:
#         # print(list(result[0]) + list(result[1]))
#         rows4.append([result[0]] + list(result[1][0]))
#     if len(result[1]) > 1:
#         if result[1][0] == None:
#             continue
#         for row in result[1]:
#             rows4.append([result[0]] + list(row))

In [28]:
# nspire_api = pd.DataFrame(rows4)
# nspire_api = nspire_api.rename(columns={
#     0: 'citystate',
#     1: 'latitude',
#     2: 'longitude',
#     3: 'address_components',
#     4: 'formatted_address',
#     5: 'place_id',
#     6: 'types'
# })

In [32]:
# Read the saved geocoding results for the NSPIRE locations.
# NOTE(review): no cell visible in this notebook writes 'test_nspire_api.csv';
# confirm where this file comes from.
nspire_api = pd.read_csv('test_nspire_api.csv', sep=',')

In [33]:
def get_locality(address_components: str) -> Optional[str]:
    """Return the place name found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``long_name`` of the first component whose types include
        ``'locality'`` or ``'administrative_area_level_3'``; ``None`` when no
        component qualifies or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        kinds = component['types']
        if 'locality' in kinds or 'administrative_area_level_3' in kinds:
            return component['long_name']
    return None

def get_state_code(address_components: str) -> Optional[str]:
    """Return the state code found in a geocoding ``address_components`` value.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.

    Returns:
        ``short_name`` of the first component whose types include
        ``'administrative_area_level_1'``; ``None`` when there is no such
        component or the input is missing.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return None  # missing cell: nothing to parse
    for component in components:
        if 'administrative_area_level_1' in component['types']:
            return component['short_name']
    return None
        
def fix_puerto_rico(address_components: str, corrected_address: str) -> str:
    """Rebuild the corrected address for rows geocoded to Puerto Rico.

    A row counts as Puerto Rico when one of its components has ``'country'``
    among its types and ``short_name == 'PR'``.

    NOTE(review): this helper is defined in more than one cell of this
    notebook; keep the copies in sync.

    Args:
        address_components: The component list, either as the string repr
            stored in the CSV files or as an already-parsed list of dicts.
            Any other value (``None``, NaN from pandas) is treated as missing.
        corrected_address: The address computed so far for this row.

    Returns:
        For Puerto Rico rows, the upper-cased ``short_name`` of the first
        ``'administrative_area_level_1'`` component followed by ``',PR'``.
        Otherwise (including missing input, or no such component)
        ``corrected_address`` unchanged.
    """
    if isinstance(address_components, str):
        components = ast.literal_eval(address_components)
    elif isinstance(address_components, (list, tuple)):
        components = address_components
    else:
        return corrected_address  # missing cell: leave the row untouched
    in_puerto_rico = any(
        'country' in component['types'] and component['short_name'] == 'PR'
        for component in components
    )
    if in_puerto_rico:
        for component in components:
            if 'administrative_area_level_1' in component['types']:
                return component['short_name'].upper() + ',PR'
    return corrected_address

def fix_cantons(citystate: str, corrected_address: str) -> str:
    """Keep the original value for cities named exactly ``CANTON``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        ``citystate`` when the text before its first comma is ``'CANTON'``,
        otherwise ``corrected_address``.
    """
    city, _, _ = citystate.partition(',')
    return citystate if city == 'CANTON' else corrected_address

# Original city/state values that are all rewritten to 'NEW YORK,NY'.
ny_fixes = dict.fromkeys(
    (
        'STATEN ISLAND,NY',
        'BRONX,NY',
        'BROOKLYN,NY',
        'QUEENS,NY',
        'MANHATTAN,NY',
    ),
    'NEW YORK,NY',
)


def fix_new_york(citystate: str, corrected_address: str) -> str:
    """Replace the corrected address for the inputs listed in ``ny_fixes``.

    Args:
        citystate: Original comma-separated city/state value.
        corrected_address: The address computed so far for this row.

    Returns:
        The ``ny_fixes`` entry for ``citystate`` when there is one, otherwise
        ``corrected_address``.
    """
    return ny_fixes.get(citystate, corrected_address)


# Build the upper-cased "LOCALITY,STATE" value from the geocoder components.
nspire_api['corrected_address'] = (
    nspire_api.address_components.apply(get_locality).str.upper()
    + ','
    + nspire_api.address_components.apply(get_state_code).str.upper()
)
# Apply the manual overrides in order; each receives the current
# corrected_address and either replaces it or passes it through.
nspire_api['corrected_address'] = nspire_api.apply(
    lambda row: fix_puerto_rico(row['address_components'], row['corrected_address']),
    axis='columns',
)
nspire_api['corrected_address'] = nspire_api.apply(
    lambda row: fix_cantons(row['citystate'], row['corrected_address']),
    axis='columns',
)
nspire_api['corrected_address'] = nspire_api.apply(
    lambda row: fix_new_york(row['citystate'], row['corrected_address']),
    axis='columns',
)

In [38]:
# Save the NSPIRE geocoding results with their corrected addresses; the index
# is not written.
nspire_api.to_csv('nspire_fixed.csv', sep=',', index=False)