In [44]:
import json
import os
from functools import partial

import pandas as pd
from backoff import on_exception, expo
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
from geopy.geocoders import Nominatim
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [3]:
DUMMY_LOCATION_STRS = ["Berlin, Deutschland",
"France",
"Paris et plein Centre",
"Punjab, Pakistan",
"South Africa",
"Boulder, Colorado",
"NONE",
"London",
"United States",
"Ethiopia",
"Ethiopia",
"TŒß",
"NONE",
"Paris, Mulhouse, sur la route",
"London",
"Barcelona",
"Liverpool",
"Bay Area, CA"]

In [4]:
DATA_PATH = 'data'

In [40]:
@sleep_and_retry
@on_exception(expo, GeocoderTimedOut, max_tries=8)
@limits(calls=1, period=1)
def get_location(location_str, geolocator, **kwargs):
    try:
        return geolocator.geocode(location_str, **kwargs)
    except GeocoderUnavailable as e:
        return None

In [20]:
@sleep_and_retry
@on_exception(expo, GeocoderTimedOut, max_tries=8)
@limits(calls=1, period=1)
def reverse_location(location_obj, geolocator, **kwargs):
    try:
        coords = ",".join((location_obj.raw['lat'], location_obj.raw['lon']))
    except AttributeError as e:
        return None
    try:
        return geolocator.reverse(coords, **kwargs)
    except GeocoderUnavailable as e:
        return None

In [25]:
def read_locations(users_path=None):
    """
    reads the input data
    :return: a dataframe with a column `location_str` containing the description of the location
    """
    if users_path is None:
        return pd.DataFrame(DUMMY_LOCATION_STRS, columns=['location_str'], index=range(len(DUMMY_LOCATION_STRS)))
    else:
        return pd.read_json(os.path.join(DATA_PATH, 'followers_2016_rehydrated.jsonl'),
                            lines=True
                            ).rename(columns={'location':'location_str'}
                                     ).set_index('id').dropna(subset=['location_str'])

        # with open(users_path, encoding='utf8') as f:
        #     # users = {k:v for l in f for k, v in json.loads(l).items()}
        #
        #     users = pd.concat(
        #         pd.DataFrame([dict(follower, **{'follows':pollster}) for follower in followers])
        #         for l in f for pollster, followers in json.loads(l).items()
        #     )
        # return users


In [26]:
users = read_locations(os.path.join(DATA_PATH, 'followers_2016_rehydrated.jsonl'))

In [27]:
users

Unnamed: 0_level_0,profile_image_url,username,public_metrics,description,protected,created_at,verified,name,pinned_tweet_id,entities,location_str,url,withheld
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1599787232,https://pbs.twimg.com/profile_images/966765484...,OGUpcoin,"{'followers_count': 2121, 'following_count': 2...",I am the Co-Founder of $UPCOIN the #crypto cur...,False,2013-07-17 02:14:15+00:00,False,Glory2GloryStudio üïäüá∫üá∏üá∫üá¶üïä,1.426972e+18,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",United States,https://t.co/0jI9gEozYA,
1549733678233837568,https://pbs.twimg.com/profile_images/155014899...,hoffpavir_evely,"{'followers_count': 150, 'following_count': 30...",,False,2022-07-20 12:31:24+00:00,False,Evelyn,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Bridgeport,https://t.co/iZAMWhQrsq,
1586767903973511168,https://pbs.twimg.com/profile_images/158859749...,richards_moniqu,"{'followers_count': 117, 'following_count': 13...",,False,2022-10-30 17:12:46+00:00,False,Monique,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Murrieta,https://t.co/UEeJUFUrCW,
1578080496906641408,https://pbs.twimg.com/profile_images/158101944...,angela13poole,"{'followers_count': 166, 'following_count': 18...",,False,2022-10-06 17:51:41+00:00,False,Angela,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Thornton,https://t.co/AlsYzn4XD2,
1550149641823469568,https://pbs.twimg.com/profile_images/155014977...,giantstocks_,"{'followers_count': 226, 'following_count': 29...",Investor & Trader. My tweets are all IMHO & ma...,False,2022-07-21 16:04:16+00:00,False,Trader Joe,,"{'description': {'hashtags': [{'start': 75, 'e...",United States,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3941300294,https://pbs.twimg.com/profile_images/832659481...,RigHandSpirits,"{'followers_count': 3804, 'following_count': 4...","Pure Alberta Spirits! Vodka, rum, whiskey, moo...",False,2015-10-19 00:19:13+00:00,False,Rig Hand Distillery,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Nisku AB,https://t.co/n63s3lApnK,
824443214937497600,https://pbs.twimg.com/profile_images/857030758...,BLISSMagAlberta,"{'followers_count': 462, 'following_count': 13...",Attain total BLISS - articles featuring #beer ...,False,2017-01-26 02:25:54+00:00,False,BLISS Magazine,8.877268e+17,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...","Alberta, Canada",https://t.co/Xfthprl0Ky,
857648235585740800,https://pbs.twimg.com/profile_images/857651838...,Mizzboogz,"{'followers_count': 29, 'following_count': 67,...","Passionate about life and love , lgbt and prou...",False,2017-04-27 17:30:48+00:00,False,Ashley Salazar,,"{'description': {'mentions': [{'start': 80, 'e...","High River, Alberta",,
2767912537,https://pbs.twimg.com/profile_images/781211374...,BigAlsSmoke,"{'followers_count': 23717, 'following_count': ...",Big Al's Smoke & Gifts is the #1 premier smoke...,False,2014-08-25 20:56:23+00:00,False,Big Al's Smoke,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...","Berkeley, CA",https://t.co/Xeb2pE1Luq,


In [28]:
# read the data
locations = read_locations(os.path.join(DATA_PATH, 'followers_2016_rehydrated.jsonl'))
# initialize the geolocator service
geolocator = Nominatim(user_agent="twitter_poll_bio_geocoding_v0.0.1")


In [34]:
unique_locations = set(locations.location_str.unique())

In [41]:
resolved = dict()
tested = set()

In [None]:
for location in tqdm(unique_locations):
    if location in tested:
        continue
    resolved[location] = get_location(location, geolocator)
    tested.add(location)

  1%|          | 136/11456 [01:00<1:33:14,  2.02it/s]

In [29]:
# # apply the geolocator
# locations['location_obj'] = locations.location_str.apply(partial(get_location, geolocator=geolocator))

GeocoderTimedOut: Service timed out

Not everything finds a match on Nominatim, esp. when there are natural language descriptions.

Perhaps we could give it a pass of NER for GEO entities first

In [None]:
locations[locations.location_obj.isna()]

The information returned by the `geocode` function is quite limited, because it tries to fit the specific string.
For example, states are rarely included in the object. Reverse geocoding helps with generalizing in this case.

In [32]:
# apply the reverse geolocator
locations['location_obj_reversed'] = locations.location_obj.apply(partial(reverse_location, geolocator=geolocator))

In [33]:
# unpack address information
addresses = pd.DataFrame(locations.location_obj_reversed.dropna().apply(lambda x:pd.Series(x.raw['address'])))
addresses.head()

Unnamed: 0,railway,road,suburb,borough,city,ISO3166-2-lvl4,postcode,country,country_code,village,...,state,region,subdistrict,house_number,tourism,neighbourhood,ISO3166-2-lvl8,state_district,quarter,city_district
0,Unter den Linden,Unter den Linden,Mitte,Mitte,Berlin,DE-BE,10117.0,Deutschland,de,,...,,,,,,,,,,
1,,D 51,,,,FR-CVL,36230.0,France,fr,Tranzault,...,Centre-Val de Loire,France m√©tropolitaine,,,,,,,,
3,,,,,,PK-PB,35090.0,Ÿæÿß⁄©ÿ≥ÿ™ÿßŸÜ,pk,,...,ŸæŸÜÿ¨ÿßÿ®,,Ahmedpur Sial Tehsil,,,,,,,
4,,,Tokologo Ward 3,,Tokologo Local Municipality,ZA-FS,,South Africa,za,,...,Free State,,,,,,,,,
5,,Arapahoe Avenue,,,Boulder,US-CO,80306.0,United States,us,,...,Colorado,,,1843.0,,,,,,


In [34]:
# merge with the original dataframe
locations = pd.merge(locations, addresses, left_index=True, right_index=True)

In [35]:
locations[['location_str', 'country_code']]

Unnamed: 0,location_str,country_code
0,"Berlin, Deutschland",de
1,France,fr
3,"Punjab, Pakistan",pk
4,South Africa,za
5,"Boulder, Colorado",us
6,NONE,it
7,London,gb
8,United States,us
9,Ethiopia,et
10,Ethiopia,et


some errors appear bizarre -- hard to correct

In [36]:
locations.loc[11].location_obj.raw

{'place_id': 43010750,
 'licence': 'Data ¬© OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 3641278609,
 'boundingbox': ['13.7846492', '13.7946492', '100.4346142', '100.4446142'],
 'lat': '13.7896492',
 'lon': '100.4396142',
 'display_name': '‡∏ä‡∏∏‡∏°‡∏ó‡∏≤‡∏á‡∏ï‡∏•‡∏¥‡πà‡∏á‡∏ä‡∏±‡∏ô, ‡∏ó‡∏≤‡∏á‡∏û‡∏¥‡πÄ‡∏®‡∏©‡∏õ‡∏£‡∏∞‡∏à‡∏¥‡∏°‡∏£‡∏±‡∏ñ‡∏¢‡∏≤, ‡πÅ‡∏Ç‡∏ß‡∏á‡∏ï‡∏•‡∏¥‡πà‡∏á‡∏ä‡∏±‡∏ô, ‡πÄ‡∏Ç‡∏ï‡∏ï‡∏•‡∏¥‡πà‡∏á‡∏ä‡∏±‡∏ô, ‡∏Å‡∏£‡∏∏‡∏á‡πÄ‡∏ó‡∏û‡∏°‡∏´‡∏≤‡∏ô‡∏Ñ‡∏£, 10170, ‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢',
 'class': 'railway',
 'type': 'station',
 'importance': 0.3740116294168639,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/transport_train_station2.p.20.png'}

In [38]:
# save data
os.makedirs(DATA_PATH, exist_ok=True)
locations[['location_str', 'country_code']].to_csv(os.path.join(DATA_PATH, 'bio_country_codes.csv'))
locations[['location_str', 'railway',
           'road', 'suburb', 'borough', 'city', 'ISO3166-2-lvl4', 'postcode',
           'country', 'country_code', 'village', 'municipality', 'county',
           'ISO3166-2-lvl6', 'state', 'region', 'subdistrict', 'house_number',
           'tourism', 'neighbourhood', 'ISO3166-2-lvl8', 'state_district',
           'quarter', 'city_district']].to_csv(os.path.join(DATA_PATH, 'bio_locations.csv'))
locations.to_pickle(os.path.join(DATA_PATH, 'bio_locations.pkl'))
