In [24]:
import os
from functools import partial

import pandas as pd

from geopy.exc import GeocoderUnavailable
from geopy.geocoders import Nominatim
from ratelimit import limits, sleep_and_retry

In [25]:
# Placeholder location strings used by `read_locations` until real data is wired in.
# The list contains duplicates, "NONE" placeholders and free-text descriptions.
DUMMY_LOCATION_STRS = [
    "Berlin, Deutschland",
    "France",
    "Paris et plein Centre",
    "Punjab, Pakistan",
    "South Africa",
    "Boulder, Colorado",
    "NONE",
    "London",
    "United States",
    "Ethiopia",
    "Ethiopia",
    # NOTE(review): the second character looks like a non-ASCII look-alike of "X" -- confirm before normalizing
    "TΧ",
    "NONE",
    "Paris, Mulhouse, sur la route",
    "London",
    "Barcelona",
    "Liverpool",
    "Bay Area, CA",
]

In [28]:
# at most one call per period of 1; `sleep_and_retry` waits and retries instead of raising when the limit is hit
@sleep_and_retry
@limits(calls=1, period=1)
def get_location(location_str, geolocator, **kwargs):
    """
    forward-geocodes a single location description
    :param location_str: the query handed to `geolocator.geocode`
    :param geolocator: object exposing a `geocode` method
    :param kwargs: extra keyword arguments forwarded unchanged to `geolocator.geocode`
    :return: whatever `geolocator.geocode` returns, or None if it raises GeocoderUnavailable
    """
    match = None
    try:
        match = geolocator.geocode(location_str, **kwargs)
    except GeocoderUnavailable:
        # best effort: an unavailable service is reported as "no match"
        pass
    return match

In [26]:
def read_locations(): #TODO: update to real data
    """
    builds the input dataframe from the placeholder list `DUMMY_LOCATION_STRS`
    :return: a dataframe with a single column `location_str` holding the location descriptions,
             indexed 0..n-1
    """
    n_rows = len(DUMMY_LOCATION_STRS)
    frame = pd.DataFrame(
        DUMMY_LOCATION_STRS,
        columns=['location_str'],
        index=range(n_rows),
    )
    return frame

In [31]:
@sleep_and_retry
@limits(calls=1, period=1)
def _reverse_rate_limited(coords, geolocator, **kwargs):
    """
    rate-limited call to `geolocator.reverse`
    :return: the result of `geolocator.reverse`, or None if it raises GeocoderUnavailable
    """
    try:
        return geolocator.reverse(coords, **kwargs)
    except GeocoderUnavailable:
        return None


def reverse_location(location_obj, geolocator, **kwargs):
    """
    reverse-geocodes the coordinates of a previously geocoded location
    :param location_obj: object whose `raw` mapping holds string `lat` and `lon` entries;
                         anything without a `raw` attribute (e.g. None) yields None
    :param geolocator: object exposing a `reverse` method
    :param kwargs: extra keyword arguments forwarded unchanged to `geolocator.reverse`
    :return: the result of `geolocator.reverse` for "lat,lon", or None when there is no
             location to reverse or the service is unavailable
    """
    try:
        raw = location_obj.raw
        coords = ",".join((raw['lat'], raw['lon']))
    except AttributeError:
        # no forward-geocoding result for this row: return without touching the
        # rate limiter, so rows that trigger no request do not cost a waiting period
        return None
    # only the actual network call is throttled
    return _reverse_rate_limited(coords, geolocator, **kwargs)

In [27]:
# set up the Nominatim client that the geocoding cells below send their requests through
geolocator = Nominatim(user_agent="twitter_poll_bio_geocoding_v0.0.1")
# load the location strings to geocode
locations = read_locations()


In [29]:
# forward-geocode every location string (one rate-limited `get_location` call per row)
locations['location_obj'] = locations['location_str'].apply(
    lambda text: get_location(text, geolocator=geolocator)
)

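Not every location string finds a match on Nominatim, especially when it is a natural-language description rather than a place name.

Perhaps we could first run a named-entity recognition (NER) pass to extract the geographic entities and geocode only those.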

In [30]:
# rows for which forward geocoding returned nothing
locations.loc[locations['location_obj'].isna()]

Unnamed: 0,location_str,location_obj
2,Paris et plein Centre,
13,"Paris, Mulhouse, sur la route",


The information returned by the `geocode` function is quite limited, because Nominatim tries to match the specific query string.
For example, the state is rarely included in the returned object. Reverse geocoding the matched coordinates returns a structured address, which helps with generalizing in this case.

In [32]:
# reverse-geocode the coordinates of every forward match (one `reverse_location` call per row)
locations['location_obj_reversed'] = locations['location_obj'].apply(
    lambda found: reverse_location(found, geolocator=geolocator)
)

In [33]:
# expand the `address` mapping of every reverse-geocoded row into one column per address field;
# rows without a reverse result are left out by `dropna`
addresses = pd.DataFrame(
    locations['location_obj_reversed']
    .dropna()
    .apply(lambda hit: pd.Series(hit.raw['address']))
)
addresses.head()

Unnamed: 0,railway,road,suburb,borough,city,ISO3166-2-lvl4,postcode,country,country_code,village,...,state,region,subdistrict,house_number,tourism,neighbourhood,ISO3166-2-lvl8,state_district,quarter,city_district
0,Unter den Linden,Unter den Linden,Mitte,Mitte,Berlin,DE-BE,10117.0,Deutschland,de,,...,,,,,,,,,,
1,,D 51,,,,FR-CVL,36230.0,France,fr,Tranzault,...,Centre-Val de Loire,France métropolitaine,,,,,,,,
3,,,,,,PK-PB,35090.0,پاکستان,pk,,...,پنجاب,,Ahmedpur Sial Tehsil,,,,,,,
4,,,Tokologo Ward 3,,Tokologo Local Municipality,ZA-FS,,South Africa,za,,...,Free State,,,,,,,,,
5,,Arapahoe Avenue,,,Boulder,US-CO,80306.0,United States,us,,...,Colorado,,,1843.0,,,,,,


In [34]:
# merge with the original dataframe
# NOTE: the default inner join keeps only the rows that have a reverse-geocoded address.
# Address columns left over from an earlier run of this cell are dropped first, so that
# re-running the cell does not produce `_x`/`_y` suffixed duplicates (on a fresh run the
# drop is a no-op).
locations = pd.merge(
    locations.drop(columns=addresses.columns, errors='ignore'),
    addresses,
    left_index=True,
    right_index=True,
)

In [35]:
# input string next to the country code taken from the reverse-geocoded address
locations.loc[:, ['location_str', 'country_code']]

Unnamed: 0,location_str,country_code
0,"Berlin, Deutschland",de
1,France,fr
3,"Punjab, Pakistan",pk
4,South Africa,za
5,"Boulder, Colorado",us
6,NONE,it
7,London,gb
8,United States,us
9,Ethiopia,et
10,Ethiopia,et


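Some errors appear bizarre and are hard to correct: for example, `NONE` resolves to a place in Italy (`it`), and row 11 resolves to a railway station in Thailand, as the raw result below shows.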

In [36]:
# raw forward-geocoding payload for row 11
locations.loc[11, 'location_obj'].raw

{'place_id': 43010750,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 3641278609,
 'boundingbox': ['13.7846492', '13.7946492', '100.4346142', '100.4446142'],
 'lat': '13.7896492',
 'lon': '100.4396142',
 'display_name': 'ชุมทางตลิ่งชัน, ทางพิเศษประจิมรัถยา, แขวงตลิ่งชัน, เขตตลิ่งชัน, กรุงเทพมหานคร, 10170, ประเทศไทย',
 'class': 'railway',
 'type': 'station',
 'importance': 0.3740116294168639,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/transport_train_station2.p.20.png'}

In [38]:
# save data
DATA_PATH = 'data'
os.makedirs(DATA_PATH, exist_ok=True)

# compact export: input string and country code only
locations[['location_str', 'country_code']].to_csv(os.path.join(DATA_PATH, 'bio_country_codes.csv'))

# flat export of the address fields. `reindex` is used instead of plain column selection
# because the set of address columns comes from the geocoding results: a field that is
# absent from this run becomes an empty column instead of raising a KeyError.
export_columns = ['location_str', 'railway',
                  'road', 'suburb', 'borough', 'city', 'ISO3166-2-lvl4', 'postcode',
                  'country', 'country_code', 'village', 'municipality', 'county',
                  'ISO3166-2-lvl6', 'state', 'region', 'subdistrict', 'house_number',
                  'tourism', 'neighbourhood', 'ISO3166-2-lvl8', 'state_district',
                  'quarter', 'city_district']
locations.reindex(columns=export_columns).to_csv(os.path.join(DATA_PATH, 'bio_locations.csv'))

# full frame, including the geocoding result objects
locations.to_pickle(os.path.join(DATA_PATH, 'bio_locations.pkl'))
