TL;DR: this notebook loads the town state csv and queries google maps api using a gcp api key
in order to retrieve location for all the addresses
the adresses are created by conctenating the town and the state
since the requests are paid, the result is stored in the town_state_geocoded_google csv

# geocode town states

In [1]:
!ls ../data/csv

cliente_tabla.csv         sample_submission_10k.csv town_state.csv
producto_tabla.csv        test.csv                  train.csv
sample_submission.csv     test_10k.csv              train_100k.csv


In [2]:
import pandas as pd

df = pd.read_csv('../data/csv/town_state.csv')
df.shape

(790, 3)

In [3]:
df[:3]

Unnamed: 0,Agencia_ID,Town,State
0,1110,2008 AG. LAGO FILT,"MÉXICO, D.F."
1,1111,2002 AG. AZCAPOTZALCO,"MÉXICO, D.F."
2,1112,2004 AG. CUAUTITLAN,ESTADO DE MÉXICO


In [4]:
df['Location'] = df.Town + ', ' + df.State
df[:3]

Unnamed: 0,Agencia_ID,Town,State,Location
0,1110,2008 AG. LAGO FILT,"MÉXICO, D.F.","2008 AG. LAGO FILT, MÉXICO, D.F."
1,1111,2002 AG. AZCAPOTZALCO,"MÉXICO, D.F.","2002 AG. AZCAPOTZALCO, MÉXICO, D.F."
2,1112,2004 AG. CUAUTITLAN,ESTADO DE MÉXICO,"2004 AG. CUAUTITLAN, ESTADO DE MÉXICO"


## open street map nominatim does NOT work
the api returns timeouts even with 10 seconds between requests

In [None]:
from geopy.geocoders import Nominatim
locator = Nominatim(user_agent='myGeocoder')

In [None]:
location = locator.geocode('Champ de Mars, Paris, France')

In [None]:
print(f'lat {location.latitude}, lon {location.longitude}')

In [None]:
from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(locator.geocode, min_delay_seconds=10, error_wait_seconds=10)

In [None]:
df['loc'] = df['Location'].apply(geocode)

In [None]:
df['point'] = df['loc'].apply(lambda loc: tuple(loc.point) if loc else None)

In [None]:
df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(), index=df.index)

## google maps geocoding WORKS

In [5]:
!ls ..

README.md           [1m[36mdata[m[m                [1m[36mmodels[m[m              [1m[36mserialize-models[m[m
[1m[36m__pycache__[m[m         env.py              [1m[36mnotebooks[m[m           [1m[36mshared_notebooks[m[m
[1m[36mbimbo[m[m               model.joblib        [1m[36mpresentation_images[m[m


In [6]:
# env.py should contain a line with google maps api key
# GOOGLE_MAPS_API_KEY='your key here'
# and should not be stored in git
from env import GOOGLE_MAPS_API_KEY

In [7]:
import googlemaps
from datetime import datetime

gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)

In [8]:
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

In [9]:
geocode_result

[{'address_components': [{'long_name': '1600',
    'short_name': '1600',
    'types': ['street_number']},
   {'long_name': 'Amphitheatre Parkway',
    'short_name': 'Amphitheatre Pkwy',
    'types': ['route']},
   {'long_name': 'Mountain View',
    'short_name': 'Mountain View',
    'types': ['locality', 'political']},
   {'long_name': 'Santa Clara County',
    'short_name': 'Santa Clara County',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'California',
    'short_name': 'CA',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'United States',
    'short_name': 'US',
    'types': ['country', 'political']},
   {'long_name': '94043', 'short_name': '94043', 'types': ['postal_code']}],
  'formatted_address': '1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA',
  'geometry': {'location': {'lat': 37.4223098, 'lng': -122.0846239},
   'location_type': 'ROOFTOP',
   'viewport': {'northeast': {'lat': 37.4236587802915,
     'lng': -122

In [10]:
gmaps.geocode('2008 AG. LAGO FILT, MÉXICO, D.F.')

[{'address_components': [{'long_name': 'Calle Lago Filt',
    'short_name': 'Calle Lago Filt',
    'types': ['route']},
   {'long_name': 'Ciudad de México',
    'short_name': 'México D.F.',
    'types': ['locality', 'political']},
   {'long_name': 'Ciudad de México',
    'short_name': 'CDMX',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'Mexico',
    'short_name': 'MX',
    'types': ['country', 'political']}],
  'formatted_address': 'Calle Lago Filt, Ciudad de México, CDMX, Mexico',
  'geometry': {'bounds': {'northeast': {'lat': 19.4430976,
     'lng': -99.19578930000002},
    'southwest': {'lat': 19.4408317, 'lng': -99.1959624}},
   'location': {'lat': 19.4419322, 'lng': -99.1957965},
   'location_type': 'GEOMETRIC_CENTER',
   'viewport': {'northeast': {'lat': 19.4433136302915,
     'lng': -99.19452686970851},
    'southwest': {'lat': 19.4406156697085, 'lng': -99.19722483029153}}},
  'partial_match': True,
  'place_id': 'ChIJl_Nv-wUC0oUR3hPQGGVRnUU',
  

In [24]:
# api rate limit
import time
for a, i in enumerate(list(range(3))):
    time.sleep(1)
    print(f'coucou {i}')

coucou 0
coucou 1
coucou 2


In [40]:
def google_geocode(location):
    time.sleep(.1)
    res = gmaps.geocode(location)

    route = None
    locality = None
    area = None
    country = None
    address = None
    lat = None
    lng = None
    bounds_ne_lat = None
    bounds_ne_lng = None
    bounds_sw_lat = None
    bounds_sw_lng = None
    viewport_ne_lat = None
    viewport_ne_lng = None
    viewport_sw_lat = None
    viewport_sw_lng = None

    if len(res) > 0:
        first_res = res[0]

        if 'address_components' in first_res:
            address_components = first_res['address_components']

            alen = len(address_components)

            if alen > 0:
                route = address_components[0]['long_name']

            if alen > 1:
                locality = address_components[1]['long_name']

            if alen > 2:
                area = address_components[2]['long_name']

            if alen > 3:
                country = address_components[3]['long_name']

        if 'formatted_address' in first_res:
            address = first_res['formatted_address']

        if 'geometry' in first_res:
            geometry = first_res['geometry']

            if 'location' in geometry:
                location = geometry['location']

                lat = location['lat']
                lng = location['lng']

            if 'bounds' in geometry:
                bounds = geometry['bounds']

                bounds_ne_lat = bounds['northeast']['lat']
                bounds_ne_lng = bounds['northeast']['lng']
                bounds_sw_lat = bounds['southwest']['lat']
                bounds_sw_lng = bounds['southwest']['lng']

            if 'viewport' in geometry:
                viewport = geometry['viewport']

                viewport_ne_lat = viewport['northeast']['lat']
                viewport_ne_lng = viewport['northeast']['lng']
                viewport_sw_lat = viewport['southwest']['lat']
                viewport_sw_lng = viewport['southwest']['lng']

    return pd.Series({
        'route': route,
        'locality': locality,
        'area': area,
        'country': country,
        'address': address,
        'lat': lat,
        'lng': lng,
        'bounds_ne_lat': bounds_ne_lat,
        'bounds_ne_lng': bounds_ne_lng,
        'bounds_sw_lat': bounds_sw_lat,
        'bounds_sw_lng': bounds_sw_lng,
        'viewport_ne_lat': viewport_ne_lat,
        'viewport_ne_lng': viewport_ne_lng,
        'viewport_sw_lat': viewport_sw_lat,
        'viewport_sw_lng': viewport_sw_lng
    })

In [42]:
# apply geocoding to each row in the dataset
start = time.time()
full_df = df.join(df.Location.apply(google_geocode))
stop = time.time()
print(f'{stop - start} seconds')
full_df[:3]

251.66800689697266 seconds


Unnamed: 0,Agencia_ID,Town,State,Location,route,locality,area,country,address,lat,lng,bounds_ne_lat,bounds_ne_lng,bounds_sw_lat,bounds_sw_lng,viewport_ne_lat,viewport_ne_lng,viewport_sw_lat,viewport_sw_lng
0,1110,2008 AG. LAGO FILT,"MÉXICO, D.F.","2008 AG. LAGO FILT, MÉXICO, D.F.",Calle Lago Filt,Ciudad de México,Ciudad de México,Mexico,"Calle Lago Filt, Ciudad de México, CDMX, Mexico",19.441932,-99.195797,19.443098,-99.195789,19.440832,-99.195962,19.443314,-99.194527,19.440616,-99.197225
1,1111,2002 AG. AZCAPOTZALCO,"MÉXICO, D.F.","2002 AG. AZCAPOTZALCO, MÉXICO, D.F.",Azcapotzalco,Mexico City,Mexico City,Mexico,"Azcapotzalco, Mexico City, CDMX, Mexico",19.484661,-99.188675,19.515136,-99.143055,19.456435,-99.221115,19.515136,-99.143055,19.456435,-99.221115
2,1112,2004 AG. CUAUTITLAN,ESTADO DE MÉXICO,"2004 AG. CUAUTITLAN, ESTADO DE MÉXICO",Cuautitlan,State of Mexico,Mexico,,"Cuautitlan, State of Mexico, Mexico",19.672659,-99.164869,19.693511,-99.139763,19.643411,-99.192603,19.693511,-99.139763,19.643411,-99.192603


In [44]:
full_df.to_csv('../data/csv/town_state_geocoded_google.csv')