In [50]:
import json
import os
from functools import partial

import pandas as pd
from backoff import on_exception, expo
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
from geopy.geocoders import Nominatim
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import pickle

In [3]:
# Small hand-written sample of free-text location strings, used by
# `read_locations` when no input file is given. Duplicates and the "NONE"
# placeholders are kept on purpose so the sample resembles real profile data.
DUMMY_LOCATION_STRS = [
    "Berlin, Deutschland",
    "France",
    "Paris et plein Centre",
    "Punjab, Pakistan",
    "South Africa",
    "Boulder, Colorado",
    "NONE",
    "London",
    "United States",
    "Ethiopia",
    "Ethiopia",
    "TΧ",
    "NONE",
    "Paris, Mulhouse, sur la route",
    "London",
    "Barcelona",
    "Liverpool",
    "Bay Area, CA",
]

In [4]:
# directory (relative to the working directory) that holds the input file and
# receives every pickle / CSV written by this notebook
DATA_PATH = 'data'

In [40]:
@sleep_and_retry
@on_exception(expo, GeocoderTimedOut, max_tries=8)
@limits(calls=1, period=1)
def get_location(location_str, geolocator, **kwargs):
    """
    Geocode a free-text location description.

    The call is rate limited (`limits(calls=1, period=1)`) and retried with
    exponential backoff on `GeocoderTimedOut` (up to 8 tries); once the
    retries are exhausted the timeout propagates to the caller.

    :param location_str: the text to look up
    :param geolocator: object exposing a `geocode(query, **kwargs)` method
    :param kwargs: passed through unchanged to `geolocator.geocode`
    :return: whatever `geolocator.geocode` returns, or None when the service
        reports `GeocoderUnavailable`
    """
    match = None
    try:
        match = geolocator.geocode(location_str, **kwargs)
    except GeocoderUnavailable:
        # service unreachable: treat it as "no result" instead of failing
        pass
    return match

In [20]:
@sleep_and_retry
@on_exception(expo, GeocoderTimedOut, max_tries=8)
@limits(calls=1, period=1)
def reverse_location(location_obj, geolocator, **kwargs):
    """
    Reverse-geocode the coordinates stored in a previously geocoded result.

    The call is rate limited (`limits(calls=1, period=1)`) and retried with
    exponential backoff on `GeocoderTimedOut` (up to 8 tries).

    :param location_obj: object whose `.raw` mapping has 'lat' and 'lon'
        entries (joined with a comma, so they must be strings)
    :param geolocator: object exposing a `reverse(query, **kwargs)` method
    :param kwargs: passed through unchanged to `geolocator.reverse`
    :return: whatever `geolocator.reverse` returns; None when `location_obj`
        has no usable `.raw` (AttributeError, e.g. it is None) or when the
        service reports `GeocoderUnavailable`
    """
    try:
        raw = location_obj.raw
        coords = ",".join([raw['lat'], raw['lon']])
    except AttributeError:
        # nothing to reverse (e.g. the forward lookup returned None)
        return None

    try:
        reversed_match = geolocator.reverse(coords, **kwargs)
    except GeocoderUnavailable:
        reversed_match = None
    return reversed_match

In [25]:
def read_locations(users_path=None):
    """
    Read the input data.

    :param users_path: path to a JSON-lines file of user objects, each with an
        `id` and a `location` field. When None, the built-in
        `DUMMY_LOCATION_STRS` sample is returned instead.
    :return: a dataframe with a column `location_str` containing the description of the location.
        For file input the frame is indexed by `id` and rows without a
        location are dropped.
    """
    if users_path is None:
        return pd.DataFrame(DUMMY_LOCATION_STRS, columns=['location_str'])

    # Read the file the caller asked for. Previously the argument was ignored
    # and a hard-coded file under DATA_PATH was always loaded.
    users = pd.read_json(users_path, lines=True)
    return (
        users
        .rename(columns={'location': 'location_str'})
        .set_index('id')
        .dropna(subset=['location_str'])
    )


In [26]:
# load the user profiles (one row per user id, with a `location_str` column)
users = read_locations(os.path.join(DATA_PATH, 'followers_2016_rehydrated.jsonl'))

In [27]:
# quick look at the loaded user table
users

Unnamed: 0_level_0,profile_image_url,username,public_metrics,description,protected,created_at,verified,name,pinned_tweet_id,entities,location_str,url,withheld
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1599787232,https://pbs.twimg.com/profile_images/966765484...,OGUpcoin,"{'followers_count': 2121, 'following_count': 2...",I am the Co-Founder of $UPCOIN the #crypto cur...,False,2013-07-17 02:14:15+00:00,False,Glory2GloryStudio 🕊🇺🇸🇺🇦🕊,1.426972e+18,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",United States,https://t.co/0jI9gEozYA,
1549733678233837568,https://pbs.twimg.com/profile_images/155014899...,hoffpavir_evely,"{'followers_count': 150, 'following_count': 30...",,False,2022-07-20 12:31:24+00:00,False,Evelyn,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Bridgeport,https://t.co/iZAMWhQrsq,
1586767903973511168,https://pbs.twimg.com/profile_images/158859749...,richards_moniqu,"{'followers_count': 117, 'following_count': 13...",,False,2022-10-30 17:12:46+00:00,False,Monique,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Murrieta,https://t.co/UEeJUFUrCW,
1578080496906641408,https://pbs.twimg.com/profile_images/158101944...,angela13poole,"{'followers_count': 166, 'following_count': 18...",,False,2022-10-06 17:51:41+00:00,False,Angela,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Thornton,https://t.co/AlsYzn4XD2,
1550149641823469568,https://pbs.twimg.com/profile_images/155014977...,giantstocks_,"{'followers_count': 226, 'following_count': 29...",Investor & Trader. My tweets are all IMHO & ma...,False,2022-07-21 16:04:16+00:00,False,Trader Joe,,"{'description': {'hashtags': [{'start': 75, 'e...",United States,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3941300294,https://pbs.twimg.com/profile_images/832659481...,RigHandSpirits,"{'followers_count': 3804, 'following_count': 4...","Pure Alberta Spirits! Vodka, rum, whiskey, moo...",False,2015-10-19 00:19:13+00:00,False,Rig Hand Distillery,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Nisku AB,https://t.co/n63s3lApnK,
824443214937497600,https://pbs.twimg.com/profile_images/857030758...,BLISSMagAlberta,"{'followers_count': 462, 'following_count': 13...",Attain total BLISS - articles featuring #beer ...,False,2017-01-26 02:25:54+00:00,False,BLISS Magazine,8.877268e+17,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...","Alberta, Canada",https://t.co/Xfthprl0Ky,
857648235585740800,https://pbs.twimg.com/profile_images/857651838...,Mizzboogz,"{'followers_count': 29, 'following_count': 67,...","Passionate about life and love , lgbt and prou...",False,2017-04-27 17:30:48+00:00,False,Ashley Salazar,,"{'description': {'mentions': [{'start': 80, 'e...","High River, Alberta",,
2767912537,https://pbs.twimg.com/profile_images/781211374...,BigAlsSmoke,"{'followers_count': 23717, 'following_count': ...",Big Al's Smoke & Gifts is the #1 premier smoke...,False,2014-08-25 20:56:23+00:00,False,Big Al's Smoke,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...","Berkeley, CA",https://t.co/Xeb2pE1Luq,


In [28]:
# read the data (this loads the same file as `users` above; `locations` is the
# frame that gets geocoded and extended in the following cells)
locations = read_locations(os.path.join(DATA_PATH, 'followers_2016_rehydrated.jsonl'))
# initialize the geolocator service; the user agent string is sent along with
# every request
geolocator = Nominatim(user_agent="twitter_poll_bio_geocoding_v0.0.1")


In [34]:
# distinct location strings, so that each one is geocoded only once
unique_locations = set(locations['location_str'].unique())

In [47]:
# bookkeeping for the forward geocoding pass
# NOTE: re-running this cell discards everything collected so far
resolved = {}                     # location string -> geocoding result
tested, timed_out = set(), set()  # strings already attempted / strings that timed out

In [48]:
# Geocode every distinct location string once. Strings already in `tested`
# are skipped, so this cell can be re-run to resume an interrupted pass.
for location in tqdm(unique_locations):
    if location in tested:
        continue
    try:
        resolved[location] = get_location(location, geolocator)
    except GeocoderTimedOut:
        # still timing out after the retries built into get_location
        timed_out.add(location)
    # Mark the string as tested only once the attempt has concluded (result or
    # timeout). Doing this in a `finally` clause would also mark it when the
    # loop is aborted (KeyboardInterrupt, unexpected error), and a resumed run
    # would then silently skip a string that was never actually looked up.
    tested.add(location)

100%|██████████| 11456/11456 [4:53:29<00:00,  1.54s/it]  


In [52]:
# save data: one pickle per bookkeeping container of the forward pass
for pkl_name, pkl_obj in (
    ('resolved.pkl', resolved),
    ('tested.pkl', tested),
    ('timed_out.pkl', timed_out),
):
    with open(os.path.join(DATA_PATH, pkl_name), 'wb+') as f:
        pickle.dump(pkl_obj, f)

In [53]:
# number of location strings whose lookup ended in a timeout
len(timed_out)

85

In [56]:
# bookkeeping for the reverse geocoding pass
# NOTE: re-running this cell discards everything collected so far
resolved_reverse = {}                             # location string -> reverse geocoding result
tested_reverse, timed_out_reverse = set(), set()  # strings already attempted / strings that timed out

In [58]:
# Reverse-geocode every successfully resolved location. Strings already in
# `tested_reverse` are skipped, so this cell can be re-run to resume.
for location, location_obj in tqdm(resolved.items()):
    if location in tested_reverse:
        continue
    if not location_obj:
        # the forward lookup found nothing, so there are no coordinates
        continue
    try:
        resolved_reverse[location] = reverse_location(location_obj, geolocator)
    except GeocoderTimedOut:
        # still timing out after the retries built into reverse_location
        timed_out_reverse.add(location)
    # Mark the string as tested only once the attempt has concluded (result or
    # timeout). Doing this in a `finally` clause would also mark it when the
    # loop is aborted (KeyboardInterrupt, unexpected error), and a resumed run
    # would then silently skip a string that was never actually looked up.
    tested_reverse.add(location)

# save data
with open(os.path.join(DATA_PATH, 'resolved_reverse.pkl'), 'wb+') as f:
    pickle.dump(resolved_reverse, f)
with open(os.path.join(DATA_PATH, 'tested_reverse.pkl'), 'wb+') as f:
    pickle.dump(tested_reverse, f)
with open(os.path.join(DATA_PATH, 'timed_out_reverse.pkl'), 'wb+') as f:
    pickle.dump(timed_out_reverse, f)

100%|██████████| 11371/11371 [2:12:52<00:00,  1.43it/s] 


In [60]:
# attach the geocoding result to every row via its location string
locations['location_obj'] = locations['location_str'].map(resolved)

In [61]:
# # apply the geolocator
# locations['location_obj'] = locations.location_str.apply(partial(get_location, geolocator=geolocator))

Not every location string finds a match on Nominatim, especially when it is a free-text, natural-language description.

Perhaps we could first run a named-entity recognition (NER) pass to extract the geographic entities before geocoding.

In [62]:
# rows whose location string could not be geocoded
locations.loc[locations['location_obj'].isna()]

Unnamed: 0_level_0,profile_image_url,username,public_metrics,description,protected,created_at,verified,name,pinned_tweet_id,entities,location_str,url,withheld,location_obj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
751961088829165568,https://pbs.twimg.com/profile_images/751970961...,Ariamjen31,"{'followers_count': 1112, 'following_count': 1...",,False,2016-07-10 02:07:49+00:00,False,𝐀𝐫𝐢𝐚𝐦@𝟎𝟓,1.564578e+18,,Near in the Ocean🌊,,,
64699857,https://pbs.twimg.com/profile_images/138706008...,writebore,"{'followers_count': 230, 'following_count': 85...",Readingofnovelsshortstoriesnewspapersandmagazi...,False,2009-08-11 12:29:27+00:00,False,David Sheahan,8.358071e+17,,Dedhill,,,
66807863,https://pbs.twimg.com/profile_images/158794761...,malachiobrien,"{'followers_count': 42883, 'following_count': ...",FASTING| PRAYER-🏃Unofficial Guinness World Rec...,False,2009-08-18 21:30:31+00:00,False,DrMalachi.eth - (Dr. Run) - Marathon Every Day,,,Semi-Professional Ultra Runner,,,
18149467,https://pbs.twimg.com/profile_images/157335018...,paulcastain,"{'followers_count': 107574, 'following_count':...",Builder Of Unstoppable Sales Teams. Sales Trai...,False,2008-12-15 23:01:06+00:00,False,Paul Castain,1.501960e+18,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",In A Sales Bullpen Near You!,https://t.co/lAEMwIKJ8J,,
856527548,https://pbs.twimg.com/profile_images/131668088...,AdriaanG_LP,"{'followers_count': 66643, 'following_count': ...",I’m an Activist for: Improved Leadership @Thin...,False,2012-10-01 15:12:12+00:00,False,Adriaan Groenewald,,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Global - in the digital age,https://t.co/mxJvL6dNIK,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769432417786400768,https://pbs.twimg.com/profile_images/147385500...,Huskey1A,"{'followers_count': 913, 'following_count': 16...","Put your FAITH in GOD and his son, our LORD an...",False,2016-08-27 07:12:38+00:00,False,HuskeyDog,,"{'description': {'hashtags': [{'start': 70, 'e...","No Mandates, Missouri",,,
133330131,https://pbs.twimg.com/profile_images/142134646...,777CHACO,"{'followers_count': 1606, 'following_count': 1...","I 'm World Traveler, I see the World with the ...",False,2010-04-15 14:48:12+00:00,False,CHACO,1.458844e+18,,Honolulu Hawaii /The Earth,,,
560555551,https://pbs.twimg.com/profile_images/115405474...,organicdot,"{'followers_count': 2595, 'following_count': 5...",Organic food and natural health advocate Veget...,False,2012-04-22 19:54:10+00:00,False,Dorothy Dent,,,California land o’fruits &nuts,,,
2988888533,https://pbs.twimg.com/profile_images/557760155...,WaxSpecials,"{'followers_count': 5858, 'following_count': 1...","What can I say? I LOVE WEED, MUSIC & CRYPTO 😍",False,2015-01-21 04:39:14+00:00,False,Wax Specials,,,"Bay Area, Cali",,,


The information returned by the `geocode` function is quite limited, because the result is matched to the specific query string.
For example, states are rarely included in the object. Reverse geocoding the returned coordinates yields a structured address (county, state, country, ...), which helps with generalizing in this case.

In [65]:
# attach the reverse geocoding result to every row via its location string
locations['location_obj_reversed'] = locations['location_str'].map(resolved_reverse)

In [63]:
# # apply the reverse geolocator
# locations['location_obj_reversed'] = locations.location_obj.apply(partial(reverse_location, geolocator=geolocator))

In [66]:
# unpack address information: one column per key of the reverse result's
# `raw['address']` mapping, one row per user with a reverse result
addresses = pd.DataFrame(
    locations['location_obj_reversed']
    .dropna()
    .apply(lambda reversed_obj: pd.Series(reversed_obj.raw['address']))
)
addresses.head()

Unnamed: 0_level_0,county,state,ISO3166-2-lvl4,country,country_code,road,neighbourhood,city,postcode,amenity,...,archipelago,subward,ward,square,allotments,landuse,community,zone,ISO3166-2-lvl10,banner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1599787232,Decatur County,Kansas,US-KS,United States,us,,,,,,...,,,,,,,,,,
1549733678233837568,Greater Bridgeport COG,Connecticut,US-CT,United States,us,Governor John Davis Lodge Turnpike,Bassickville Historic District,Bridgeport,6601.0,,...,,,,,,,,,,
1586767903973511168,Riverside County,California,US-CA,United States,us,Kalmia Street,,Murrieta,92562.0,,...,,,,,,,,,,
1578080496906641408,Adams County,Colorado,US-CO,United States,us,Civic Center Drive,,Thornton,80229.0,Thornton Civic Center,...,,,,,,,,,,
1550149641823469568,Decatur County,Kansas,US-KS,United States,us,,,,,,...,,,,,,,,,,


In [67]:
# merge with the original dataframe on the index (default inner join, so only
# rows that have an address remain in `locations` afterwards)
locations = locations.merge(addresses, left_index=True, right_index=True)

In [68]:
# compare the original text with the country code that was derived from it
locations.loc[:, ['location_str', 'country_code']]

Unnamed: 0_level_0,location_str,country_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10285,"Orlando, FL",us
451943,Perugia - Italy,it
614543,Biella - Italy,it
687203,London-ish,al
785595,"Dallas, Texas",us
...,...,...
1613973600650027008,"Fresno, CA",us
1613993985361727488,"Bandung, Jawa Barat",id
1614036014535680000,Egypt,eg
1614049376480935936,ATX,kz


Some errors appear bizarre and are hard to correct: for example, "London-ish" is resolved to Albania (`al`) and "ATX" to Kazakhstan (`kz`).

In [71]:
# inspect the raw geocoding result behind the "London-ish" row
locations.loc[687203, 'location_obj'].raw

{'place_id': 52010716,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 4602247639,
 'boundingbox': ['41.3311355', '41.3312355', '19.8173253', '19.8174253'],
 'lat': '41.3311855',
 'lon': '19.8173753',
 'display_name': 'London, Bulevardi Zogu i Parë, Qendra, Njësia Bashkiake Nr. 9, Tiranë, Bashkia Tiranë, Qarku i Tiranës, Shqipëria Qendrore, 1005, Shqipëria',
 'class': 'amenity',
 'type': 'cafe',
 'importance': 0.11000999999999997,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/food_cafe.p.20.png'}

In [75]:
def latlong(location_obj):
    """
    Extract the coordinates of a geocoding result.

    :param location_obj: object whose `.raw` mapping has 'lat' and 'lon' entries
    :return: a Series with the entries `lat` and `lon`; both are None when
        `location_obj` has no usable `.raw` (AttributeError, e.g. it is None)
    """
    lat = lon = None
    try:
        raw = location_obj.raw
        lat, lon = raw['lat'], raw['lon']
    except AttributeError:
        # no geocoding result for this row: keep both coordinates empty
        pass
    return pd.Series({'lat': lat, 'lon': lon})

# add `lat` / `lon` columns taken from the forward geocoding result
locations = locations.merge(
    locations['location_obj'].apply(latlong),
    left_index=True,
    right_index=True,
)

In [76]:
# save data
os.makedirs(DATA_PATH, exist_ok=True)

# location string -> country code only
locations.loc[:, ['location_str', 'country_code']].to_csv(
    os.path.join(DATA_PATH, 'bio_country_codes.csv')
)

# location string plus a fixed selection of address fields and coordinates
locations.loc[:, [
    'location_str', 'railway',
    'road', 'suburb', 'borough', 'city', 'ISO3166-2-lvl4', 'postcode',
    'country', 'country_code', 'village', 'municipality', 'county',
    'ISO3166-2-lvl6', 'state', 'region', 'subdistrict', 'house_number',
    'tourism', 'neighbourhood', 'ISO3166-2-lvl8', 'state_district',
    'quarter', 'city_district', 'lat', 'lon',
]].to_csv(os.path.join(DATA_PATH, 'bio_locations.csv'))

# complete frame, including the geocoding result objects
locations.to_pickle(os.path.join(DATA_PATH, 'bio_locations.pkl'))
